HEAD ======= <<<<<<< HEAD >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
<<<<<<< HEAD ======= >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564eSuggested citation: Madaga, Lavinia, Jordan Chamberlin, Bisrat Gebrekidan, Robert Hijmans, Nicholaus Kuboja, and Maxwell Mkondiwa. 2024. “Predictive mapping of wholesale grain prices for rural areas in Tanzania.” https://github.com/EiA2030-ex-ante/Tanzania-Price-Data
We are interested in estimating the prices of agricultural food commodities across space and time, on the basis of prices as observed at some market locations. Here we explore methods to model these prices, <<<<<<< HEAD using monthly data from across Tanzania over the period May 2021-July ======= using monthly data from across Tanzania over the period May 2021-June >>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e 2024.
This document describes the process of fitting a Random Forest model to predict these prices. The performance of the Random Forest model is evaluated using Root Mean Square Errors (RMSE) and R-Square as test statistics. We also compare the observed prices with the predicted prices using an out of sample dataset.
library(geodata)
## Warning: package 'geodata' was built under R version 4.3.3
library(lubridate)
library(terra)
library(data.table)
library(randomForest)
library(httr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
<<<<<<< HEAD
## Warning: package 'ggplot2' was built under R version 4.3.3
=======
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
library(pdp)
## Warning: package 'pdp' was built under R version 4.3.3
library(gridExtra)
library(stats)
library(dplyr)
library(stringr)
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Warning: package 'spam' was built under R version 4.3.3
<<<<<<< HEAD
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
library(ggplot2)
=======
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
This dataset contains price information for various crops across different regions and markets in Tanzania from 2021 to 2024. The data was acquired from Tanzania’s Ministry of Industry and Trade.
setwd("H:/Tanzania Price data/Datasets")
prices <- fread("Tanzania_Price_Data_AllCrops_with_Coordinates4.csv")
dim(prices)
<<<<<<< HEAD
## [1] 7912 21
=======
## [1] 7441 21
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
head(prices)
## Region Market Maize..min.price. Maize..max.price. Rice..min.price.
## 1: Arusha Arusha 43000 45000 140000
## 2: Dar es Salaam Temeke 46000 47000 120000
## 3: Dodoma Majengo 45000 50000 132000
## 4: Dodoma Kibaigwa 30000 42000 NA
## 5: Kagera Bukoba 33000 35000 110000
## 6: Manyara Babati 30000 45000 120000
## Rice..max.price. Sorghum..min.price. Sorghum..max.price.
## 1: 170000 60000 65000
## 2: 210000 80000 100000
## 3: 200000 50000 60000
## 4: NA 45000 48000
## 5: 140000 140000 150000
## 6: 170000 60000 80000
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## 1: 70000 75000
## 2: 80000 100000
## 3: 52000 64000
## 4: NA NA
## 5: 120000 140000
## 6: 80000 100000
## Finger.Millet..min.price. Finger.Millet..max.price. Wheat..min.price.
## 1: 120000 125000 70000
## 2: 175000 180000 110000
## 3: 200000 252000 NA
## 4: NA NA NA
## 5: 170000 180000 170000
## 6: 120000 130000 100000
## Wheat..max.price. Beans..min.price. Beans..max.price.
## 1: 78000 130000 165000
## 2: 120000 220000 260000
## 3: NA 200000 245000
## 4: NA NA NA
## 5: 180000 110000 170000
## 6: 120000 150000 180000
## Irish.Potatoes..min.price. Irish.Potatoes..max.price. Date Latitude
## 1: 70000 75000 5/5/2021 -3.36968
## 2: 75000 75000 5/5/2021 -6.85097
## 3: 56000 68000 5/5/2021 -6.17971
## 4: NA NA 5/5/2021 -6.08110
## 5: 60000 75000 5/5/2021 -1.33140
## 6: 60000 60000 5/5/2021 -4.21006
## Longitude
## 1: 36.68808
## 2: 39.25672
## 3: 35.74109
## 4: 36.64645
## 5: 31.81293
## 6: 35.74915
table(prices$Market)
##
## Arusha Babati Bariadi Buguruni Bukoba Geita
<<<<<<< HEAD
## 374 243 78 10 390 38
## Igawilo Ilala Iringa Kibaigwa Kigoma Kilombero
## 38 203 392 211 247 38
## Kinondoni Lindi Majengo Manyara Mbeya Mgandini
## 203 246 438 116 117 38
## Morogoro Moshi Mpanda Mpimbwe Mtwara Musoma
## 386 252 195 39 400 287
## Mwananyamala Mwanza Namfua Njombe Nyankumbu Pwani
## 38 349 38 174 38 78
## Shinyanga Singida Songea Songwe Sumbawanga Tabora
## 282 83 361 48 385 364
## Tandale Tandika Tanga Temeke Ubungo
## 55 64 361 169 46
=======
## 371 227 64 10 373 38
## Igawilo Ilala Iringa Kibaigwa Kigoma Kilimanjaro
## 24 187 375 208 227 13
## Kilombero Kinondoni Lindi Majengo Manyara Mbeya
## 24 202 232 408 115 117
## Mgandini Mnalani Morogoro Moshi Mpanda Mpimbwe
## 24 9 369 225 193 39
## Mtwara Musoma Mwananyamala Mwanza Namfua Njombe
## 384 270 24 333 24 172
## Nyankumbu Pwani Shinyanga Singida Songea Songwe
## 24 55 266 83 344 34
## Sumbawanga Tabora Tandale Tandika Tanga Temeke
## 368 347 41 50 359 153
## Ubungo Ujiji
## 32 4
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
sapply(prices, class)
## Region Market
## "character" "character"
## Maize..min.price. Maize..max.price.
## "integer" "integer"
## Rice..min.price. Rice..max.price.
## "integer" "integer"
## Sorghum..min.price. Sorghum..max.price.
## "integer" "integer"
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## "integer" "integer"
## Finger.Millet..min.price. Finger.Millet..max.price.
## "integer" "integer"
## Wheat..min.price. Wheat..max.price.
## "integer" "integer"
## Beans..min.price. Beans..max.price.
## "integer" "integer"
## Irish.Potatoes..min.price. Irish.Potatoes..max.price.
## "integer" "integer"
## Date Latitude
## "character" "numeric"
## Longitude
## "numeric"
# Convert to date format
prices$Date <- lubridate::mdy(prices$Date)
Check the Region and Market names and coodinates. Make sure the Region and Market names are harmonized and properly geocoded
unique(prices[Region=="Arusha",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Arusha -3.36968 36.68808
## 2: Kilombero -3.37324 36.67870
unique(prices[Region=="Dar es Salaam",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Temeke -6.850970 39.25672
## 2: Kinondoni -6.784070 39.27007
## 3: Ilala -6.829410 39.26289
## 4: Tandika -6.867912 39.25480
## 5: Buguruni -6.838380 39.24359
## 6: Tandale -6.795230 39.24085
## 7: Ubungo -6.793620 39.20966
## 8: Mwananyamala -6.788890 39.25840
unique(prices[Region=="Dodoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Majengo -6.17971 35.74109
## 2: Kibaigwa -6.08110 36.64645
unique(prices[Region=="Kagera",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bukoba -1.3314 31.81293
unique(prices[Region=="Manyara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Babati -4.21006 35.74915
## 2: Manyara -4.46011 37.20217
unique(prices[Region=="Rukwa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Sumbawanga -7.95239 31.62319
unique(prices[Region=="Mpanda",.(Market, Latitude, Longitude)])
## Empty data.table (0 rows and 3 cols): Market,Latitude,Longitude
unique(prices[Region=="Mtwara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mtwara -10.28009 40.18191
unique(prices[Region=="Tabora",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tabora -5.01972 32.80767
unique(prices[Region=="Tanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tanga -5.074260 39.09993
## 2: Mgandini -5.086235 39.09561
unique(prices[Region=="Iringa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Iringa -7.78001 35.69671
unique(prices[Region=="Kigoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
<<<<<<< HEAD
## 1: Kigoma -4.89697 29.65987
=======
## 1: Kigoma -4.89697 29.65987
## 2: Ujiji -4.90837 29.69203
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
unique(prices[Region=="Morogoro",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Morogoro -6.82771 37.66542
unique(prices[Region=="Mwanza",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mwanza -2.51969 32.90144
unique(prices[Region=="Mara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Musoma -1.49982 33.8083
unique(prices[Region=="Ruvuma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songea -10.67873 35.64836
unique(prices[Region=="Shinyanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Shinyanga -3.67226 33.43069
unique(prices[Region=="Kilimanjaro",.(Market, Latitude, Longitude)])
<<<<<<< HEAD
## Market Latitude Longitude
## 1: Moshi -3.34865 37.34352
=======
## Market Latitude Longitude
## 1: Moshi -3.34865 37.34352
## 2: Kilimanjaro -3.33854 37.32654
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
unique(prices[Region=="Mbeya",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mbeya -8.909940 33.45517
## 2: Igawilo -8.924246 33.56788
unique(prices[Region=="Katavi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mpanda -6.34295 31.07299
## 2: Mpimbwe -7.24425 31.81782
## 3: Majengo -6.34809 31.07055
unique(prices[Region=="Njombe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Njombe -9.33805 34.76949
unique(prices[Region=="Lindi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Lindi -9.995 39.708
unique(prices[Region=="Singida",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Singida -4.812610 34.7428
## 2: Namfua -4.821121 34.7470
unique(prices[Region=="Pwani",.(Market, Latitude, Longitude)])
<<<<<<< HEAD
## Market Latitude Longitude
## 1: Pwani -6.44221 38.90622
=======
## Market Latitude Longitude
## 1: Pwani -6.44221 38.90622
## 2: Mnalani -6.44221 38.90622
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
unique(prices[Region=="Simiyu",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bariadi -2.80355 33.98699
unique(prices[Region=="Geita",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Geita -2.870760 32.23408
## 2: Nyankumbu -2.895955 32.22871
unique(prices[Region=="Songwe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songwe -8.95179 33.24377
setnames(prices, old = "Maize..min.price.", new = "mai.price.min")
setnames(prices, old = "Rice..min.price.", new = "ric.price.min")
setnames(prices, old = "Sorghum..min.price.", new = "sor.price.min")
setnames(prices, old = "Bulrush.Millet..min.price.", new = "bul.price.min")
setnames(prices, old = "Finger.Millet..min.price.", new = "fin.price.min")
setnames(prices, old = "Wheat..min.price.", new = "whe.price.min")
setnames(prices, old = "Beans..min.price.", new = "bea.price.min")
setnames(prices, old = "Irish.Potatoes..min.price.", new = "pot.price.min")
setnames(prices, old = "Maize..max.price.", new = "mai.price.max")
setnames(prices, old = "Rice..max.price.", new = "ric.price.max")
setnames(prices, old = "Sorghum..max.price.", new = "sor.price.max")
setnames(prices, old = "Bulrush.Millet..max.price.", new = "bul.price.max")
setnames(prices, old = "Finger.Millet..max.price.", new = "fin.price.max")
setnames(prices, old = "Wheat..max.price.", new = "whe.price.max")
setnames(prices, old = "Beans..max.price.", new = "bea.price.max")
setnames(prices, old = "Irish.Potatoes..max.price.", new = "pot.price.max")
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "integer" "integer" "integer"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "integer" "integer" "integer" "integer" "integer"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "integer" "integer" "integer" "integer" "integer"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "integer" "integer" "integer" "Date" "numeric"
## Longitude
## "numeric"
#convert prices to numeric
prices$mai.price.min <- as.numeric(prices$mai.price.min)
prices$ric.price.min <- as.numeric(prices$ric.price.min)
prices$sor.price.min <- as.numeric(prices$sor.price.min)
prices$bul.price.min <- as.numeric(prices$bul.price.min)
prices$fin.price.min <- as.numeric(prices$fin.price.min)
prices$whe.price.min <- as.numeric(prices$whe.price.min)
prices$bea.price.min <- as.numeric(prices$bea.price.min)
prices$pot.price.min <- as.numeric(prices$pot.price.min)
prices$mai.price.max <- as.numeric(prices$mai.price.max)
prices$ric.price.max <- as.numeric(prices$ric.price.max)
prices$sor.price.max <- as.numeric(prices$sor.price.max)
prices$bul.price.max <- as.numeric(prices$bul.price.max)
prices$fin.price.max <- as.numeric(prices$fin.price.max)
prices$whe.price.max <- as.numeric(prices$whe.price.max)
prices$bea.price.max <- as.numeric(prices$bea.price.max)
prices$pot.price.max <- as.numeric(prices$pot.price.max)
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "numeric" "numeric" "numeric"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "numeric" "numeric" "numeric" "numeric" "numeric"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "numeric" "numeric" "numeric" "numeric" "numeric"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "numeric" "numeric" "numeric" "Date" "numeric"
## Longitude
## "numeric"
# convert to price per kg
prices$mai.price.min <- prices$mai.price.min/100
prices$ric.price.min <- prices$ric.price.min/100
prices$sor.price.min <- prices$sor.price.min/100
prices$bul.price.min <- prices$bul.price.min/100
prices$fin.price.min <- prices$fin.price.min/100
prices$whe.price.min <- prices$whe.price.min/100
prices$bea.price.min <- prices$bea.price.min/100
prices$pot.price.min <- prices$pot.price.min/100
prices$mai.price.max <- prices$mai.price.max/100
prices$ric.price.max <- prices$ric.price.max/100
prices$sor.price.max <- prices$sor.price.max/100
prices$bul.price.max <- prices$bul.price.max/100
prices$fin.price.max <- prices$fin.price.max/100
prices$whe.price.max <- prices$whe.price.max/100
prices$bea.price.max <- prices$bea.price.max/100
prices$pot.price.max <- prices$pot.price.max/100
# calculate average of min and max
prices$mai.price <- (prices$mai.price.min + prices$mai.price.max) / 2
prices$ric.price <- (prices$ric.price.min + prices$ric.price.max) / 2
prices$sor.price <- (prices$sor.price.min + prices$sor.price.max) / 2
prices$bul.price <- (prices$bul.price.min + prices$bul.price.max) / 2
prices$fin.price <- (prices$fin.price.min + prices$fin.price.max) / 2
prices$whe.price <- (prices$whe.price.min + prices$whe.price.max) / 2
prices$bea.price <- (prices$bea.price.min + prices$bea.price.max) / 2
prices$pot.price <- (prices$pot.price.min + prices$pot.price.max) / 2
#We can add dates by using the year and the month names
prices$Day <- day(prices$Date)
prices$Month <- month(prices$Date)
prices$Year <- year(prices$Date)
# drop unneccessary columns
prices <- prices[,!c("mai.price.min", "mai.price.max",
"ric.price.min", "ric.price.max",
"sor.price.min", "sor.price.max",
"bul.price.min", "bul.price.max",
"fin.price.min", "fin.price.max",
"whe.price.min", "whe.price.max",
"bea.price.min", "bea.price.max",
"pot.price.min", "pot.price.max")]
# calculate monthly mean prices by market
prices.monthly <- prices[, .(mai.price = mean(mai.price, na.rm = TRUE),
ric.price = mean(ric.price, na.rm = TRUE),
sor.price = mean(sor.price, na.rm = TRUE),
bul.price = mean(bul.price, na.rm = TRUE),
fin.price = mean(fin.price, na.rm = TRUE),
whe.price = mean(whe.price, na.rm = TRUE),
bea.price = mean(bea.price, na.rm = TRUE),
pot.price = mean(pot.price, na.rm = TRUE)),
by=.(Region, Market, Month, Year, Latitude, Longitude)]
# reshape to long (so that prices for different commodities can be simultaneously estimated)
prices.monthly
## Region Market Month Year Latitude Longitude mai.price ric.price
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 469.0000 1530.000
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 485.0000 1650.000
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 501.5000 1592.000
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 375.0000 NaN
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 426.2500 1260.000
## ---
<<<<<<< HEAD
## 860: Mwanza Mwanza 7 2024 -2.51969 32.90144 725.0000 2350.000
## 861: Pwani Pwani 7 2024 -6.44221 38.90622 835.7143 1946.429
## 862: Simiyu Bariadi 7 2024 -2.80355 33.98699 660.0000 1450.000
## 863: Kigoma Kigoma 7 2024 -4.89697 29.65987 578.5714 1417.857
## 864: Rukwa Sumbawanga 7 2024 -7.95239 31.62319 546.7857 1653.571
=======
## 836: Mwanza Mwanza 6 2024 -2.51969 32.90144 725.0000 2350.000
## 837: Pwani Mnalani 6 2024 -6.44221 38.90622 772.2222 2000.000
## 838: Simiyu Bariadi 6 2024 -2.80355 33.98699 563.3333 1316.667
## 839: Kigoma Kigoma 6 2024 -4.89697 29.65987 504.2222 1488.889
## 840: Rukwa Sumbawanga 6 2024 -7.95239 31.62319 533.3333 1750.000
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## sor.price bul.price fin.price whe.price bea.price pot.price
## 1: 695.000 761.000 1333.000 793.0000 1545.000 745.0000
## 2: 900.000 900.000 1775.000 1230.0000 2420.000 730.0000
## 3: 558.000 581.000 1752.000 NaN 2075.000 618.0000
## 4: 437.500 NaN NaN NaN NaN NaN
## 5: 1375.000 1337.500 1637.500 1637.5000 1300.000 675.0000
## ---
<<<<<<< HEAD
## 860: 1428.571 1421.429 1621.429 NaN 2528.571 1142.8571
## 861: 1442.857 1550.000 1857.143 1975.0000 2967.857 1735.7143
## 862: 1400.000 1739.286 1728.571 2457.1429 2750.000 1500.0000
## 863: 1821.429 1892.857 1907.143 1867.8571 2228.571 1075.0000
## 864: NaN NaN 975.000 719.6429 2135.714 641.0714
=======
## 836: 1400.000 1350.000 1550.000 NaN 2400.000 1000.0000
## 837: 1522.222 1544.444 2022.222 2400.0000 3116.667 1800.0000
## 838: 1300.000 1788.889 1716.667 2316.6667 2805.556 1500.0000
## 839: 1266.667 1266.667 1433.333 2122.2222 2216.667 1127.7778
## 840: NaN NaN 1088.889 780.5556 1888.889 652.7778
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
prices.monthly.long <- melt(prices.monthly, id.vars=c('Region', 'Market', 'Month', 'Year', 'Latitude', 'Longitude'),)
# rename columns
setnames(prices.monthly.long, old="variable", new="Crop")
setnames(prices.monthly.long, old="value", new="pkg")
# replace crop names
prices.monthly.long[Crop == "mai.price", Crop := "Maize"]
prices.monthly.long[Crop == "ric.price", Crop := "Rice"]
prices.monthly.long[Crop == "sor.price", Crop := "Sorghum"]
prices.monthly.long[Crop == "bul.price", Crop := "B.Millet"]
prices.monthly.long[Crop == "fin.price", Crop := "F.Millet"]
prices.monthly.long[Crop == "whe.price", Crop := "Wheat"]
prices.monthly.long[Crop == "bea.price", Crop := "Beans"]
prices.monthly.long[Crop == "pot.price", Crop := "Potato"]
# Reset the factor levels to updated levels
prices.monthly.long[, Crop := factor(Crop)]
# Check the unique values again
unique(prices.monthly.long$Crop)
## [1] Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# generate dummies to use in place of factors (for later spatial predictions, which are struggling with factors)
prices.monthly.long[, maize := ifelse(Crop == "Maize",1,0)]
prices.monthly.long[, rice := ifelse(Crop == "Rice",1,0)]
prices.monthly.long[, sorghum := ifelse(Crop == "Sorghum",1,0)]
prices.monthly.long[, bmillet := ifelse(Crop == "B.Millet",1,0)]
prices.monthly.long[, fmillet := ifelse(Crop == "F.Millet",1,0)]
prices.monthly.long[, wheat := ifelse(Crop == "Wheat",1,0)]
prices.monthly.long[, beans := ifelse(Crop == "Beans",1,0)]
prices.monthly.long[, potato := ifelse(Crop == "Potato",1,0)]
prices.monthly.long[, jan := ifelse(Month == 1 , 1, 0)]
prices.monthly.long[, feb := ifelse(Month == 2 , 1, 0)]
prices.monthly.long[, mar := ifelse(Month == 3 , 1, 0)]
prices.monthly.long[, apr := ifelse(Month == 4 , 1, 0)]
prices.monthly.long[, may := ifelse(Month == 5 , 1, 0)]
prices.monthly.long[, jun := ifelse(Month == 6 , 1, 0)]
prices.monthly.long[, jul := ifelse(Month == 7 , 1, 0)]
prices.monthly.long[, aug := ifelse(Month == 8 , 1, 0)]
prices.monthly.long[, sep := ifelse(Month == 9 , 1, 0)]
prices.monthly.long[, oct := ifelse(Month == 10 , 1, 0)]
prices.monthly.long[, nov := ifelse(Month == 11 , 1, 0)]
prices.monthly.long[, dec := ifelse(Month == 12 , 1, 0)]
# replace NaN with NAs in the price observations
prices.monthly.long[is.nan(pkg), pkg := NA]
# Remove observations with missing observations
prices.monthly.long <- na.omit(prices.monthly.long)
# bring in raster stack as predictors
geodata_path("H:/Tanzania Price data/Datasets/geodata")
list.files("H:/Tanzania Price data/Datasets/geodata", recursive=TRUE)
## [1] "climate/wc2.1_country/TZA_wc2.1_30s_bio.tif"
## [2] "travel/travel_time_to_cities_u5.tif"
## [3] "TRUE/gadm/gadm41_TZA_0_pk.rds"
## [4] "TRUE/gadm/gadm41_TZA_1_pk.rds"
## [5] "TRUE/gadm/gadm41_TZA_2_pk.rds"
## [6] "TRUE/gadm/gadm41_TZA_3_pk.rds"
## [7] "TRUE/spam/spam2017v2r1_ssa_area.zip"
## [8] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif"
## [9] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_H.tif"
## [10] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_I.tif"
## [11] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_L.tif"
## [12] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_R.tif"
## [13] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_S.tif"
## [14] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_A.tif"
## [15] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_H.tif"
## [16] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_I.tif"
## [17] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_L.tif"
## [18] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif"
## [19] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_S.tif"
## [20] "TRUE/spam/spam2017v2r1_ssa_yield.zip"
## [21] "TRUE/travel/travel_time_to_cities_u5.tif"
## [22] "TRUE/travel/travel_time_to_ports_1.tif"
## [23] "TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif"
# tza0 <- gadm(country="TZA", level=0)
# tza1 <- gadm(country="TZA", level=1)
# tza2 <- gadm(country="TZA", level=2)
# tza3 <- gadm(country="TZA", level=3)
tza0 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_0_pk.rds")
tza1 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_1_pk.rds")
tza2 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_2_pk.rds")
tza3 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_3_pk.rds")
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# see if these show up correctly
plot(tza1)
plot(mypts, col="Red", add=TRUE)
<<<<<<< HEAD
# text(mypts, label="Market")
# create reference raster
tza_extent <- ext(tza1) |> floor()
r <- crop(rast(res=1/12), tza_extent)
## Interpolate
#xy <- as.matrix(mypts[,c("Longitude", "Latitude")])
xy <- geom(mypts)[,c("y","x")]
#tps <- Tps(xy, p$spatial)
tps <- Tps(xy, mypts$pkg)
sp <- interpolate(r, tps)
sp <- mask(sp, tza1)
plot(sp)
lines(tza1)
<<<<<<< HEAD
rf <- randomForest(pkg ~ Longitude + Latitude , data=mypts)
sp3 <- interpolate(r, rf, xyNames=c("Longitude", "Latitude"))
sp3 <- mask(sp3, tza1)
plot(sp3)
lines(tza1)
<<<<<<< HEAD
The covariates used include a mix of crop-specific indicators, temporal variables to capture monthly and yearly effects, geographical coordinates, accessibility measures, climatic conditions, and lagged rainfall to account for delayed effects of weather on crop prices.
ttcity <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_cities_u5.tif")
ttport <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_ports_1.tif")
clm <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif")
area <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif")
yield <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif")
popd <- rast("gpw_v4_population_density_rev11_2020_10m.tif")
names(ttcity) <- c("ttcity_u5") ## travel time cities of 100k or more
names(ttport) <- c("ttport_1") ## travel time to major ports
names(clm) <- gsub("wc2.1_30s_", "", names(clm))
names(area) <- c("MAI_ARE") # SPAM maize area 2010
names(yield) <- c("MAI_YLD") # SPAM maize yield 2010
names(popd) <- c("popdens") # GPW4
comment(ttcity) <- "travel time to cities 100k or more"
comment(ttport) <- "travel time to major ports"
comment(popd) <- "population density 2020 (GPW4 @ 10dm)"
comment(clm)[1] <-"BIO1 = Annual Mean Temperature"
comment(clm)[2] <-"BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))"
comment(clm)[3] <-"BIO3 = Isothermality (BIO2/BIO7) (×100)"
comment(clm)[4] <-"BIO4 = Temperature Seasonality (standard deviation ×100)"
comment(clm)[5] <-"BIO5 = Max Temperature of Warmest Month"
comment(clm)[6] <-"BIO6 = Min Temperature of Coldest Month"
comment(clm)[7] <-"BIO7 = Temperature Annual Range (BIO5-BIO6)"
comment(clm)[8] <-"BIO8 = Mean Temperature of Wettest Quarter"
comment(clm)[9] <-"BIO9 = Mean Temperature of Driest Quarter"
comment(clm)[10] <-"BIO10 = Mean Temperature of Warmest Quarter"
comment(clm)[11] <-"BIO11 = Mean Temperature of Coldest Quarter"
comment(clm)[12] <-"BIO12 = Annual Precipitation"
comment(clm)[13] <-"BIO13 = Precipitation of Wettest Month"
comment(clm)[14] <-"BIO14 = Precipitation of Driest Month"
comment(clm)[15] <-"BIO15 = Precipitation Seasonality (Coefficient of Variation)"
comment(clm)[16] <-"BIO16 = Precipitation of Wettest Quarter"
comment(clm)[17] <-"BIO17 = Precipitation of Driest Quarter"
comment(clm)[18] <-"BIO18 = Precipitation of Warmest Quarter"
comment(clm)[19] <-"BIO19 = Precipitation of Coldest Quarter"
ttcity <- resample(ttcity, r)
ttport <- resample(ttport, r)
clm <- resample(clm, r)
area <- resample(area, r)
popd <- resample(popd, r)
freq(is.na(area))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
area <- classify(area, cbind(NA,0))
yield <- resample(yield, r)
freq(is.na(yield))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
yield <- classify(yield, cbind(NA,0))
# check again
compareGeom(ttcity, ttport, clm, area, yield, popd)
## [1] TRUE
latgrd <- longrd <- r
latgrd[] <- yFromCell(latgrd, 1:ncell(latgrd))
longrd[] <- xFromCell(longrd, 1:ncell(longrd))
names(latgrd) <- c("latitude")
names(longrd) <- c("longitude")
rstack <- c(ttcity, ttport, clm, area, yield, popd, latgrd, longrd)
names(rstack)
## [1] "ttcity_u5" "ttport_1" "bio_1" "bio_2" "bio_3" "bio_4"
## [7] "bio_5" "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [13] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15" "bio_16"
## [19] "bio_17" "bio_18" "bio_19" "MAI_ARE" "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
# create focal mean to extract from (as alternative to using buffers for extraction, which are not supported in terra)
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack, w=fm, fun="mean", na.policy="all", fillvalue=NA, # na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
# extract values to dataset -- use a 20km buffer
# do a focal sum of 20km radius - this is about 0.18 of a decimal degree... 0.18*112=20.16
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack2, w=fm, fun="sum", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
chirps_path <- "H:/Tanzania Price data/chirps_data"
chirps_files <- list.files(chirps_path, pattern = ".tif$", full.names = TRUE)
# Read all CHIRPS data files into a SpatRaster collection
chirps_rasters <- rast(chirps_files)
#crop to Tanzania boundary
Chirps_Tz <- crop(chirps_rasters, tza1)
writeRaster(Chirps_Tz, "Tz_chirps_monthly_croped.tif", overwrite=TRUE)
Tz_chirps_monthly <- terra::rast("Tz_chirps_monthly_croped.tif")
Tz_chirps_monthly
## class : SpatRaster
<<<<<<< HEAD
## dimensions : 215, 222, 45 (nrow, ncol, nlyr)
=======
## dimensions : 215, 222, 44 (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source : Tz_chirps_monthly_croped.tif
## names : chirp~20.11, chirp~20.12, chirp~21.01, chirp~21.02, chirp~21.03, chirp~21.04, ...
## min values : -9999.0000, -9999.0000, -9999.000, -9999.0000, -9999.0000, -9999.0000, ...
## max values : 459.7112, 504.3329, 508.448, 585.3241, 750.1949, 960.3376, ...
#Replace -9999 with NA
Tz_chirps_monthly <- classify(Tz_chirps_monthly, cbind(-9999,NA))
#extract layer names
layer_names <- names(Tz_chirps_monthly)
layer_names
## [1] "chirps-v2.0.2020.11" "chirps-v2.0.2020.12" "chirps-v2.0.2021.01"
## [4] "chirps-v2.0.2021.02" "chirps-v2.0.2021.03" "chirps-v2.0.2021.04"
## [7] "chirps-v2.0.2021.05" "chirps-v2.0.2021.06" "chirps-v2.0.2021.07"
## [10] "chirps-v2.0.2021.08" "chirps-v2.0.2021.09" "chirps-v2.0.2021.10"
## [13] "chirps-v2.0.2021.11" "chirps-v2.0.2021.12" "chirps-v2.0.2022.01"
## [16] "chirps-v2.0.2022.02" "chirps-v2.0.2022.03" "chirps-v2.0.2022.04"
## [19] "chirps-v2.0.2022.05" "chirps-v2.0.2022.06" "chirps-v2.0.2022.07"
## [22] "chirps-v2.0.2022.08" "chirps-v2.0.2022.09" "chirps-v2.0.2022.10"
## [25] "chirps-v2.0.2022.11" "chirps-v2.0.2022.12" "chirps-v2.0.2023.01"
## [28] "chirps-v2.0.2023.02" "chirps-v2.0.2023.03" "chirps-v2.0.2023.04"
## [31] "chirps-v2.0.2023.05" "chirps-v2.0.2023.06" "chirps-v2.0.2023.07"
## [34] "chirps-v2.0.2023.08" "chirps-v2.0.2023.09" "chirps-v2.0.2023.10"
## [37] "chirps-v2.0.2023.11" "chirps-v2.0.2023.12" "chirps-v2.0.2024.01"
## [40] "chirps-v2.0.2024.02" "chirps-v2.0.2024.03" "chirps-v2.0.2024.04"
<<<<<<< HEAD
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06" "chirps-v2.0.2024.07"
=======
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06"
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# We need to create a sequence of dates from the layer names
# Extract year and month from layer names and convert to Date
dates <- as.Date(paste0(sub("chirps-v2.0\\.", "", layer_names), "-01"), format = "%Y.%m-%d")
dates
## [1] "2020-11-01" "2020-12-01" "2021-01-01" "2021-02-01" "2021-03-01"
## [6] "2021-04-01" "2021-05-01" "2021-06-01" "2021-07-01" "2021-08-01"
## [11] "2021-09-01" "2021-10-01" "2021-11-01" "2021-12-01" "2022-01-01"
## [16] "2022-02-01" "2022-03-01" "2022-04-01" "2022-05-01" "2022-06-01"
## [21] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [26] "2022-12-01" "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01"
## [31] "2023-05-01" "2023-06-01" "2023-07-01" "2023-08-01" "2023-09-01"
## [36] "2023-10-01" "2023-11-01" "2023-12-01" "2024-01-01" "2024-02-01"
<<<<<<< HEAD
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01" "2024-07-01"
=======
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01"
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# Assign these dates to the SpatRaster object
time(Tz_chirps_monthly) <- dates
#rename the layers to the formatted dates
names(Tz_chirps_monthly) <- dates
# Check the SpatRaster object
print(Tz_chirps_monthly)
## class : SpatRaster
<<<<<<< HEAD
## dimensions : 215, 222, 45 (nrow, ncol, nlyr)
=======
## dimensions : 215, 222, 44 (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 5.859746, 1.903993, 0.8161677, 0.3187093, 0.9539621, 1.17571, ...
## max values : 459.711212, 504.332947, 508.4479980, 585.3240967, 750.1949463, 960.33765, ...
<<<<<<< HEAD
## time (days) : 2020-11-01 to 2024-07-01
=======
## time (days) : 2020-11-01 to 2024-06-01
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# do a focal mean of 100km radius - this is about 0.9 of a decimal degree... 0.9009*112=100.9008
# Calculate the focal mean for each layer (month)
fm_r <- focalMat(Tz_chirps_monthly, d=0.9, type='circle', fillNA=FALSE)
Rainfall_focal_sum_100km <- focal(Tz_chirps_monthly, w=fm_r, fun="mean", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE)
# Check the result
Rainfall_focal_sum_100km
## class : SpatRaster
<<<<<<< HEAD
## dimensions : 215, 222, 45 (nrow, ncol, nlyr)
=======
## dimensions : 215, 222, 44 (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.79649, 27.27837, 8.092316, 4.489223, 11.30184, 15.74836, ...
## max values : 313.14248, 326.88227, 416.590468, 395.422070, 464.67840, 569.81990, ...
<<<<<<< HEAD
## time (days) : 2020-11-01 to 2024-07-01
=======
## time (days) : 2020-11-01 to 2024-06-01
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
Rainfall <- Rainfall_focal_sum_100km
#Resample
Rainfall_res <- resample(Rainfall, r)
Rainfall_res
## class : SpatRaster
<<<<<<< HEAD
## dimensions : 144, 144, 45 (nrow, ncol, nlyr)
=======
## dimensions : 144, 144, 44 (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.89972, 32.9472, 8.116516, 4.527868, 11.83634, 15.90436, ...
## max values : 310.41034, 326.4339, 416.389160, 395.367279, 462.19266, 550.26270, ...
<<<<<<< HEAD
## time (days) : 2020-11-01 to 2024-07-01
=======
## time (days) : 2020-11-01 to 2024-06-01
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
Here we define function that calculates 6 month lag sum of rainfall for each month in our data. The output is raster datsets.
calculate_lagged_sum <- function(raster_stack, num_months = 6) {
# Get the time vector from the raster stack
time_vector <- time(raster_stack)
# Initialize list to store lagged sum rasters
lagged_sum_rasters <- vector("list", length(time_vector))
# Loop through each layer in the raster stack
for (i in seq_along(time_vector)) {
if (i > num_months) { # We need at least 'num_months' previous layers to calculate the lagged sum
# Determine the start and end dates for the lag period
end_date <- time_vector[i] # Date of the current layer being processed
start_date <- end_date %m-% months(num_months) #The date num_months before the end_date
# Select the layers that fall within the lag period
lag_period_layers <- raster_stack[[which(time_vector > start_date & time_vector <= end_date)]]
# Calculate the sum of the selected layers
if (nlyr(lag_period_layers) == num_months) {
lagged_sum_rasters[[i]] <- sum(lag_period_layers, na.rm = TRUE)
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
}
# Combine the lagged sum rasters into a single raster stack, excluding empty rasters
lagged_sum_stack <- rast(lagged_sum_rasters)
# Set the names for the layers in the lagged sum stack
names(lagged_sum_stack) <- names(raster_stack)[!is.na(lagged_sum_rasters)]
return(lagged_sum_stack)
}
# Calculate the 6-month lagged sum for each period in the raster stack
lagged_rainfall_sum <- calculate_lagged_sum(Rainfall_res, num_months = 6)
# Remove the first 6 layers from the raster stack since they are empty
lagged_rainfall_sum_filtered <- lagged_rainfall_sum[[7:nlyr(lagged_rainfall_sum)]]
# check the result
print(lagged_rainfall_sum_filtered)
## class : SpatRaster
<<<<<<< HEAD
## dimensions : 144, 144, 39 (nrow, ncol, nlyr)
=======
## dimensions : 144, 144, 38 (nrow, ncol, nlyr)
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2021-05-01, 2021-06-01, 2021-07-01, 2021-08-01, 2021-09-01, 2021-10-01, ...
## min values : 172.7856, 105.5447, 105.4121, 105.1864, 26.20571, 7.070483, ...
## max values : 1512.4689, 1355.9669, 1052.9220, 1051.9214, 1038.13874, 643.716021, ...
names(lagged_rainfall_sum_filtered) <- paste0(names(lagged_rainfall_sum_filtered), "_rain.sum.lag")
plot(lagged_rainfall_sum_filtered)
## We'll have to include lagged_rainfall_sum_filtered in the predictor stack.
rstack
## class : SpatRaster
## dimensions : 144, 144, 26 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : ttcity_u5, ttport_1, bio_1, bio_2, bio_3, bio_4, ...
## min values : 3.350081e-02, 997.9427, 4.333545, 6.546645, 53.85881, 19.75255, ...
## max values : 1.166560e+03, 3043.3459, 28.472235, 15.285786, 93.14099, 266.33594, ...
names(rstack)
## [1] "ttcity_u5" "ttport_1" "bio_1" "bio_2" "bio_3" "bio_4"
## [7] "bio_5" "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [13] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15" "bio_16"
## [19] "bio_17" "bio_18" "bio_19" "MAI_ARE" "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
rstack3 <- c(rstack, lagged_rainfall_sum_filtered)
names(rstack3)
## [1] "ttcity_u5" "ttport_1"
## [3] "bio_1" "bio_2"
## [5] "bio_3" "bio_4"
## [7] "bio_5" "bio_6"
## [9] "bio_7" "bio_8"
## [11] "bio_9" "bio_10"
## [13] "bio_11" "bio_12"
## [15] "bio_13" "bio_14"
## [17] "bio_15" "bio_16"
## [19] "bio_17" "bio_18"
## [21] "bio_19" "MAI_ARE"
## [23] "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
## [27] "2021-05-01_rain.sum.lag" "2021-06-01_rain.sum.lag"
## [29] "2021-07-01_rain.sum.lag" "2021-08-01_rain.sum.lag"
## [31] "2021-09-01_rain.sum.lag" "2021-10-01_rain.sum.lag"
## [33] "2021-11-01_rain.sum.lag" "2021-12-01_rain.sum.lag"
## [35] "2022-01-01_rain.sum.lag" "2022-02-01_rain.sum.lag"
## [37] "2022-03-01_rain.sum.lag" "2022-04-01_rain.sum.lag"
## [39] "2022-05-01_rain.sum.lag" "2022-06-01_rain.sum.lag"
## [41] "2022-07-01_rain.sum.lag" "2022-08-01_rain.sum.lag"
## [43] "2022-09-01_rain.sum.lag" "2022-10-01_rain.sum.lag"
## [45] "2022-11-01_rain.sum.lag" "2022-12-01_rain.sum.lag"
## [47] "2023-01-01_rain.sum.lag" "2023-02-01_rain.sum.lag"
## [49] "2023-03-01_rain.sum.lag" "2023-04-01_rain.sum.lag"
## [51] "2023-05-01_rain.sum.lag" "2023-06-01_rain.sum.lag"
## [53] "2023-07-01_rain.sum.lag" "2023-08-01_rain.sum.lag"
## [55] "2023-09-01_rain.sum.lag" "2023-10-01_rain.sum.lag"
## [57] "2023-11-01_rain.sum.lag" "2023-12-01_rain.sum.lag"
## [59] "2024-01-01_rain.sum.lag" "2024-02-01_rain.sum.lag"
## [61] "2024-03-01_rain.sum.lag" "2024-04-01_rain.sum.lag"
<<<<<<< HEAD
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
## [65] "2024-07-01_rain.sum.lag"
=======
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
#Extract to the point dataset
extr1 <- terra::extract(rstack3, mypts, method = "bilinear")
mypts <- cbind(mypts, extr1)
# Remove the ID column from the dataset
mypts <- mypts[, !names(mypts) %in% "ID"]
head(mypts)
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1 Arusha Arusha 5 2021 -3.36968 36.68808 Maize 469.00 1 0
## 2 Dar es Salaam Temeke 5 2021 -6.85097 39.25672 Maize 485.00 1 0
## 3 Dodoma Majengo 5 2021 -6.17971 35.74109 Maize 501.50 1 0
## 4 Dodoma Kibaigwa 5 2021 -6.08110 36.64645 Maize 375.00 1 0
## 5 Kagera Bukoba 5 2021 -1.33140 31.81293 Maize 426.25 1 0
## 6 Manyara Babati 5 2021 -4.21006 35.74915 Maize 375.00 1 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## sep oct nov dec ttcity_u5 ttport_1 bio_1 bio_2 bio_3 bio_4
## 1 0 0 0 0 6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2 0 0 0 0 0.5137705 1692.300 25.93087 8.970697 67.33757 153.60644
## 3 0 0 0 0 11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4 0 0 0 0 83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5 0 0 0 0 17.1729083 2006.820 20.84789 8.869465 83.92802 38.70414
## 6 0 0 0 0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
## bio_5 bio_6 bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375 882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572 582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717 649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068 768.0339
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18 bio_19
## 1 237.8160 7.36089814 92.10601 482.4729 24.95659498 276.6105 32.88688141
## 2 264.3072 28.32696315 76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 133.9309 0.01660535 115.12207 375.4677 0.08605806 284.0681 0.08605806
## 4 129.1152 0.21909479 100.22821 368.5410 2.45863517 320.1588 4.15077049
## 5 346.9308 43.30159629 58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006 0.91639794 89.44063 381.6907 6.74877687 325.6114 12.35086025
## MAI_ARE MAI_YLD popdens latitude longitude
## 1 3.107393e+02 1.528850e+02 994.5706 -3.36968 36.68808
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097 39.25672
## 3 1.283795e-04 9.139631e-03 352.5410 -6.17971 35.74109
## 4 1.489781e+03 3.828877e+02 101.0565 -6.08110 36.64645
## 5 6.446327e+02 1.834932e+03 280.9803 -1.33140 31.81293
## 6 5.496486e+02 1.917829e+03 219.1393 -4.21006 35.74915
## 2021-05-01_rain.sum.lag 2021-06-01_rain.sum.lag 2021-07-01_rain.sum.lag
## 1 541.4019 482.1614 435.6677
## 2 809.3149 777.8958 722.6213
## 3 637.1213 535.1131 365.9182
## 4 656.0751 567.7319 416.4267
## 5 1149.9932 937.6197 853.1074
## 6 576.1356 490.7897 395.8152
## 2021-08-01_rain.sum.lag 2021-09-01_rain.sum.lag 2021-10-01_rain.sum.lag
## 1 386.3060 320.00676 161.93116
## 2 649.9489 604.59631 337.22359
## 3 183.7701 52.54273 15.42844
## 4 277.1030 153.91512 66.15258
## 5 811.6691 763.24715 615.07899
## 6 289.9778 154.11524 68.29101
## 2021-11-01_rain.sum.lag 2021-12-01_rain.sum.lag 2022-01-01_rain.sum.lag
## 1 101.83447 179.80052 232.6533
## 2 216.69636 288.29559 406.3529
## 3 19.66440 70.12616 416.5519
## 4 49.56118 106.72427 413.7123
## 5 488.90052 605.55387 808.6724
## 6 46.47620 108.69646 245.5580
## 2022-02-01_rain.sum.lag 2022-03-01_rain.sum.lag 2022-04-01_rain.sum.lag
## 1 330.6131 383.2848 540.3073
## 2 490.8561 534.5386 763.3206
## 3 817.0083 917.8080 1000.7087
## 4 665.6779 764.1859 927.5299
## 5 899.5895 957.6149 1138.6710
## 6 441.5905 510.5650 641.9617
## 2022-05-01_rain.sum.lag 2022-06-01_rain.sum.lag 2022-07-01_rain.sum.lag
## 1 530.1299 455.6274 403.9131
## 2 816.5829 729.6579 636.5197
## 3 997.3701 947.9434 601.5170
## 4 958.6012 903.3041 595.9210
## 5 1219.1735 1115.3495 905.1190
## 6 643.2443 581.5847 444.9141
## 2022-08-01_rain.sum.lag 2022-09-01_rain.sum.lag 2022-10-01_rain.sum.lag
## 1 305.5547 252.9373 100.53888
## 2 552.9257 495.7899 268.19916
## 3 201.0583 100.2519 14.64072
## 4 344.1166 244.4002 79.55153
## 5 853.5636 777.4608 541.86704
## 6 248.8724 179.9022 48.53265
## 2022-11-01_rain.sum.lag 2022-12-01_rain.sum.lag 2023-01-01_rain.sum.lag
## 1 118.72860 197.4710 209.2468
## 2 249.91483 366.5402 424.9013
## 3 24.33803 210.4590 364.0985
## 4 63.76563 237.8550 386.3680
## 5 469.63839 598.3139 701.7206
## 6 63.04448 191.9892 242.6080
## 2023-02-01_rain.sum.lag 2023-03-01_rain.sum.lag 2023-04-01_rain.sum.lag
## 1 215.1493 279.3411 509.3222
## 2 470.6298 548.0837 839.2660
## 3 459.6726 564.8242 612.5806
## 4 488.0359 587.2738 698.1311
## 5 667.8848 832.3617 977.3868
## 6 281.6344 387.7473 521.9884
## 2023-05-01_rain.sum.lag 2023-06-01_rain.sum.lag 2023-07-01_rain.sum.lag
## 1 506.2685 448.0443 434.9563
## 2 867.6821 794.6204 712.8424
## 3 603.2209 416.3525 262.7130
## 4 703.3618 531.7645 383.2904
## 5 939.2760 855.6117 744.2889
## 6 509.0339 383.3928 332.5788
## 2023-08-01_rain.sum.lag 2023-09-01_rain.sum.lag 2023-10-01_rain.sum.lag
## 1 430.3774 366.50878 151.77184
## 2 671.7647 605.81817 375.59469
## 3 167.1400 61.98845 14.85932
## 4 281.9979 182.81237 74.83833
## 5 701.5641 576.55398 478.95586
## 6 293.7072 187.82502 55.57329
## 2023-11-01_rain.sum.lag 2023-12-01_rain.sum.lag 2024-01-01_rain.sum.lag
## 1 265.66469 299.4620 435.0957
## 2 742.63087 865.3863 1088.0394
## 3 61.64418 263.3634 560.4381
## 4 140.44970 310.9307 640.9717
## 5 677.93283 840.0104 941.7438
## 6 137.00931 245.0625 448.6259
## 2024-02-01_rain.sum.lag 2024-03-01_rain.sum.lag 2024-04-01_rain.sum.lag
## 1 531.3442 668.3936 896.7951
## 2 1108.6897 1277.2697 1582.9947
## 3 754.5153 856.2409 923.1968
## 4 804.6320 935.2507 1026.7195
## 5 1023.2543 1002.7834 1183.9751
## 6 603.8745 714.4857 863.0247
<<<<<<< HEAD
## 2024-05-01_rain.sum.lag 2024-06-01_rain.sum.lag 2024-07-01_rain.sum.lag
## 1 821.3999 770.3854 635.6426
## 2 1235.6993 1103.4509 878.0843
## 3 875.1995 673.5048 376.4371
## 4 951.3725 780.1154 449.4857
## 5 1003.6453 792.8721 747.1512
## 6 793.5998 682.0563 478.5240
=======
## 2024-05-01_rain.sum.lag 2024-06-01_rain.sum.lag
## 1 821.3999 770.3854
## 2 1235.6993 1103.4509
## 3 875.1995 673.5048
## 4 951.3725 780.1154
## 5 1003.6453 792.8721
## 6 793.5998 682.0563
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
Here we extract sum of lag rainfall for each row under the column rain.sum.lag
mypts_df <- as.data.frame(mypts)
# Define the function to obtain sum of lag rainfall from corresponding rasters to mypts under rain.sum.lag column (for each row)
# Each extraction has to match the month and year
get_rain_sum_row <- function(current_date, mypts_row) {
# Extract the rainfall value for the current date
rain_sum <- mypts_row[[paste0(current_date, "_rain.sum.lag")]]
return(rain_sum)
}
# Loop through each row and obtain the rainfall sum for each month and year
for (i in 1:nrow(mypts_df)) {
# Extract relevant data for the current row
month <- mypts_df$Month[i]
year <- mypts_df$Year[i]
current_date <- paste0(year, "-", sprintf("%02d", month), "-01") # Format date correctly
# Pass necessary data to the function
rain_sum <- get_rain_sum_row(current_date, mypts_df[i, ])
# Update the rain.sum.lag column
mypts_df$rain.sum.lag[i] <- rain_sum
}
# Update the SpatVector with the new rain.avg column
mypts$rain.sum.lag <- mypts_df$rain.sum.lag
# I'll drop the dates with rain.sum.lag from mypts, seems redundant
column_indices <- grep("^202[0-4]-", names(mypts))
mypts <- mypts[, -column_indices]
names(mypts)
## [1] "Region" "Market" "Month" "Year" "Latitude"
## [6] "Longitude" "Crop" "pkg" "maize" "rice"
## [11] "sorghum" "bmillet" "fmillet" "wheat" "beans"
## [16] "potato" "jan" "feb" "mar" "apr"
## [21] "may" "jun" "jul" "aug" "sep"
## [26] "oct" "nov" "dec" "ttcity_u5" "ttport_1"
## [31] "bio_1" "bio_2" "bio_3" "bio_4" "bio_5"
## [36] "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [41] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15"
## [46] "bio_16" "bio_17" "bio_18" "bio_19" "MAI_ARE"
## [51] "MAI_YLD" "popdens" "latitude" "longitude" "rain.sum.lag"
#define Month as a factor
#mypts$Month <- as.factor(mypts$Month)
#levels(mypts$Month)
#We'll define month as an interger instead.
# Check to make sure Month is interger
sapply(mypts, class)
## Region Market Month Year Latitude Longitude
## "character" "character" "integer" "integer" "numeric" "numeric"
## Crop pkg maize rice sorghum bmillet
## "factor" "numeric" "numeric" "numeric" "numeric" "numeric"
## fmillet wheat beans potato jan feb
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## mar apr may jun jul aug
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## sep oct nov dec ttcity_u5 ttport_1
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_1 bio_2 bio_3 bio_4 bio_5 bio_6
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_19 MAI_ARE MAI_YLD popdens latitude longitude
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## rain.sum.lag
## "numeric"
# drop levels that don't exist in Crop field
mypts$Crop <- mypts$Crop[,drop=TRUE]
levels(mypts$Crop)
## [1] "Maize" "Rice" "Sorghum" "B.Millet" "F.Millet" "Wheat" "Beans"
## [8] "Potato"
Coefficient estimates from the linear model provide a detailed insight into the relationship between each predictor and the response variable.
# Fit the linear model
lm_model <- lm(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = mypts)
summary(lm_model)
##
## Call:
## lm(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet +
## wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 +
## bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 +
## bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 +
## bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude +
## Latitude + rain.sum.lag, data = mypts)
##
## Residuals:
## Min 1Q Median 3Q Max
<<<<<<< HEAD
## -1459.51 -243.70 -13.83 238.66 1620.17
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.233e+05 1.191e+04 -35.529 < 2e-16 ***
## maize -6.523e+01 1.844e+01 -3.537 0.000408 ***
## rice 1.381e+03 1.844e+01 74.904 < 2e-16 ***
## sorghum 3.849e+02 1.950e+01 19.743 < 2e-16 ***
## bmillet 4.723e+02 2.091e+01 22.580 < 2e-16 ***
## fmillet 7.840e+02 1.925e+01 40.735 < 2e-16 ***
## wheat 9.450e+02 2.114e+01 44.707 < 2e-16 ***
## beans 1.518e+03 1.846e+01 82.214 < 2e-16 ***
## potato NA NA NA NA
## Month 4.276e+00 1.898e+00 2.253 0.024292 *
## Year 2.013e+02 5.651e+00 35.622 < 2e-16 ***
## ttcity_u5 1.860e+00 1.595e-01 11.661 < 2e-16 ***
## ttport_1 -1.884e+00 1.697e-01 -11.103 < 2e-16 ***
## bio_1 -2.381e+03 1.913e+02 -12.445 < 2e-16 ***
## bio_2 -2.564e+03 2.390e+02 -10.726 < 2e-16 ***
## bio_3 2.211e+02 1.924e+01 11.493 < 2e-16 ***
## bio_4 2.042e+01 4.048e+00 5.046 4.65e-07 ***
## bio_5 1.899e+03 1.843e+02 10.305 < 2e-16 ***
## bio_6 -1.860e+03 1.952e+02 -9.527 < 2e-16 ***
## bio_7 NA NA NA NA
## bio_8 8.773e+02 9.767e+01 8.982 < 2e-16 ***
## bio_9 -4.409e+01 8.071e+01 -0.546 0.584883
## bio_10 -1.086e+03 2.162e+02 -5.024 5.21e-07 ***
## bio_11 2.628e+03 3.105e+02 8.465 < 2e-16 ***
## bio_12 1.087e+00 2.744e-01 3.963 7.50e-05 ***
## bio_13 2.895e-01 9.237e-01 0.313 0.753959
## bio_14 5.356e+01 1.231e+01 4.352 1.37e-05 ***
## bio_15 2.023e+01 2.103e+00 9.620 < 2e-16 ***
## bio_16 -1.176e+00 7.576e-01 -1.553 0.120493
## bio_17 -1.533e+01 3.553e+00 -4.315 1.62e-05 ***
## bio_18 1.488e-01 2.500e-01 0.595 0.551679
## bio_19 3.072e+00 2.777e+00 1.106 0.268608
## MAI_ARE -1.173e-02 3.274e-02 -0.358 0.720176
## MAI_YLD 1.315e-01 1.745e-02 7.537 5.54e-14 ***
## Longitude 7.918e+01 3.568e+01 2.219 0.026521 *
## Latitude 9.897e-01 3.509e+01 0.028 0.977500
## rain.sum.lag -2.107e-01 1.945e-02 -10.835 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 379 on 5898 degrees of freedom
## Multiple R-squared: 0.7341, Adjusted R-squared: 0.7325
## F-statistic: 478.9 on 34 and 5898 DF, p-value: < 2.2e-16
=======
## -1491.81 -239.93 -10.63 235.20 1602.93
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.592e+05 1.221e+04 -37.593 < 2e-16 ***
## maize -5.132e+01 1.850e+01 -2.774 0.005549 **
## rice 1.397e+03 1.850e+01 75.482 < 2e-16 ***
## sorghum 3.847e+02 1.957e+01 19.660 < 2e-16 ***
## bmillet 4.745e+02 2.103e+01 22.556 < 2e-16 ***
## fmillet 7.905e+02 1.933e+01 40.885 < 2e-16 ***
## wheat 9.527e+02 2.126e+01 44.821 < 2e-16 ***
## beans 1.516e+03 1.852e+01 81.852 < 2e-16 ***
## potato NA NA NA NA
## Month 6.676e+00 1.886e+00 3.540 0.000403 ***
## Year 2.186e+02 5.854e+00 37.343 < 2e-16 ***
## ttcity_u5 1.881e+00 1.588e-01 11.850 < 2e-16 ***
## ttport_1 -1.918e+00 1.685e-01 -11.388 < 2e-16 ***
## bio_1 -2.372e+03 1.891e+02 -12.542 < 2e-16 ***
## bio_2 -2.675e+03 2.351e+02 -11.377 < 2e-16 ***
## bio_3 2.262e+02 1.925e+01 11.747 < 2e-16 ***
## bio_4 2.172e+01 4.038e+00 5.380 7.73e-08 ***
## bio_5 1.992e+03 1.793e+02 11.107 < 2e-16 ***
## bio_6 -1.950e+03 1.923e+02 -10.139 < 2e-16 ***
## bio_7 NA NA NA NA
## bio_8 9.181e+02 9.258e+01 9.917 < 2e-16 ***
## bio_9 -6.182e+01 7.954e+01 -0.777 0.437097
## bio_10 -1.244e+03 2.052e+02 -6.061 1.43e-09 ***
## bio_11 2.754e+03 3.005e+02 9.165 < 2e-16 ***
## bio_12 1.155e+00 2.783e-01 4.150 3.38e-05 ***
## bio_13 -7.293e-02 9.213e-01 -0.079 0.936911
## bio_14 5.699e+01 1.259e+01 4.526 6.14e-06 ***
## bio_15 2.089e+01 2.133e+00 9.796 < 2e-16 ***
## bio_16 -1.014e+00 7.661e-01 -1.323 0.185851
## bio_17 -1.791e+01 3.588e+00 -4.991 6.19e-07 ***
## bio_18 1.459e-01 2.512e-01 0.581 0.561329
## bio_19 4.667e+00 2.700e+00 1.729 0.083933 .
## MAI_ARE -2.628e-02 3.250e-02 -0.808 0.418874
## MAI_YLD 1.432e-01 1.704e-02 8.406 < 2e-16 ***
## Longitude 9.447e+01 3.530e+01 2.676 0.007466 **
## Latitude 1.300e+01 3.535e+01 0.368 0.713134
## rain.sum.lag -2.220e-01 1.938e-02 -11.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 374.6 on 5718 degrees of freedom
## Multiple R-squared: 0.7403, Adjusted R-squared: 0.7388
## F-statistic: 479.5 on 34 and 5718 DF, p-value: < 2.2e-16
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
First check if there are any predictors with NA values
for(column in seq_along(mypts)){
if(any(is.na(mypts[column]))){
print(paste0("Column: ", colnames(mypts)[column], " has at least one NA value"))
}
}
#There are no columns with missing values
Data from May 2021 - Dec 2023 will be used for model training while more recent data from Jan - June 2024 will be used for Validation.
# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
<<<<<<< HEAD
#head(training_data)
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
#head(validation_data)
=======
head(training_data)
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1 Arusha Arusha 5 2021 -3.36968 36.68808 Maize 469.00 1 0
## 2 Dar es Salaam Temeke 5 2021 -6.85097 39.25672 Maize 485.00 1 0
## 3 Dodoma Majengo 5 2021 -6.17971 35.74109 Maize 501.50 1 0
## 4 Dodoma Kibaigwa 5 2021 -6.08110 36.64645 Maize 375.00 1 0
## 5 Kagera Bukoba 5 2021 -1.33140 31.81293 Maize 426.25 1 0
## 6 Manyara Babati 5 2021 -4.21006 35.74915 Maize 375.00 1 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## sep oct nov dec ttcity_u5 ttport_1 bio_1 bio_2 bio_3 bio_4
## 1 0 0 0 0 6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2 0 0 0 0 0.5137705 1692.300 25.93087 8.970697 67.33757 153.60644
## 3 0 0 0 0 11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4 0 0 0 0 83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5 0 0 0 0 17.1729083 2006.820 20.84789 8.869465 83.92802 38.70414
## 6 0 0 0 0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
## bio_5 bio_6 bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375 882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572 582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717 649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068 768.0339
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18 bio_19
## 1 237.8160 7.36089814 92.10601 482.4729 24.95659498 276.6105 32.88688141
## 2 264.3072 28.32696315 76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 133.9309 0.01660535 115.12207 375.4677 0.08605806 284.0681 0.08605806
## 4 129.1152 0.21909479 100.22821 368.5410 2.45863517 320.1588 4.15077049
## 5 346.9308 43.30159629 58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006 0.91639794 89.44063 381.6907 6.74877687 325.6114 12.35086025
## MAI_ARE MAI_YLD popdens latitude longitude rain.sum.lag
## 1 3.107393e+02 1.528850e+02 994.5706 -3.36968 36.68808 541.4019
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097 39.25672 809.3149
## 3 1.283795e-04 9.139631e-03 352.5410 -6.17971 35.74109 637.1213
## 4 1.489781e+03 3.828877e+02 101.0565 -6.08110 36.64645 656.0751
## 5 6.446327e+02 1.834932e+03 280.9803 -1.33140 31.81293 1149.9932
## 6 5.496486e+02 1.917829e+03 219.1393 -4.21006 35.74915 576.1356
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
head(validation_data)
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1 Dar es Salaam Ilala 1 2024 -6.82941 39.26289 Maize 775.0000 1 0
## 2 Dar es Salaam Temeke 1 2024 -6.85097 39.25672 Maize 850.0000 1 0
## 3 Kilimanjaro Moshi 1 2024 -3.34865 37.34352 Maize 850.0000 1 0
## 4 Singida Singida 1 2024 -4.81261 34.74280 Maize 700.0000 1 0
## 5 Arusha Arusha 1 2024 -3.36968 36.68808 Maize 709.0909 1 0
## 6 Dodoma Majengo 1 2024 -6.17971 35.74109 Maize 690.4545 1 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## sep oct nov dec ttcity_u5 ttport_1 bio_1 bio_2 bio_3 bio_4
## 1 0 0 0 0 0.4051143 1689.367 25.92678 8.978264 67.42460 152.9108
## 2 0 0 0 0 0.5137705 1692.300 25.93087 8.970697 67.33757 153.6064
## 3 0 0 0 0 11.3442758 1450.131 22.29721 11.005590 67.75448 180.7284
## 4 0 0 0 0 165.2321068 1606.655 20.38367 11.683626 71.57368 137.7943
## 5 0 0 0 0 6.5704844 1405.327 19.27741 10.934885 68.03365 169.4943
## 6 0 0 0 0 11.0485593 1752.489 22.37704 12.001499 69.67643 153.4637
## bio_5 bio_6 bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 1 32.19589 18.88015 13.31574 26.54073 24.05313 27.71579 24.02143 1122.8900
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 31.30237 15.06427 16.23809 22.88764 20.26380 24.25183 19.85315 938.0520
## 4 28.10475 11.78088 16.32386 21.29394 18.36286 21.69755 18.36286 662.1342
## 5 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375 882.1826
## 6 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572 582.4022
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18 bio_19
## 1 258.0938 28.00268809 75.59542 582.2198 89.31864258 266.4441 103.38645312
## 2 264.3072 28.32696315 76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 277.1083 14.38640622 97.50828 541.7736 53.36366989 211.4703 72.58868003
## 4 140.9090 0.00000000 104.78967 387.3265 0.31403068 192.0734 0.31403068
## 5 237.8160 7.36089814 92.10601 482.4729 24.95659498 276.6105 32.88688141
## 6 133.9309 0.01660535 115.12207 375.4677 0.08605806 284.0681 0.08605806
## MAI_ARE MAI_YLD popdens latitude longitude rain.sum.lag
## 1 3.490943e-03 1.426787e-02 8044.9216 -6.82941 39.26289 1085.9139
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097 39.25672 1088.0394
## 3 1.654091e+03 3.030923e+03 522.6697 -3.34865 37.34352 508.5944
## 4 1.631720e+01 1.052680e+03 254.1591 -4.81261 34.74280 613.3819
## 5 3.107393e+02 1.528850e+02 994.5706 -3.36968 36.68808 435.0957
## 6 1.283795e-04 9.139631e-03 352.5410 -6.17971 35.74109 560.4381
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
The tuneRF function in the randomForest package is used to tune the mtry parameter, which is the number of variables randomly sampled as candidates at each split in the random forest. The function requires a data frame of predictor variables and a response variable.
# Convert training_data data to data frame
mypts_df <- as.data.frame(training_data)
trf <- tuneRF(x=mypts_df[,1:ncol(mypts_df)], # Prediction variables
y=mypts_df$pkg) # Response variable
<<<<<<< HEAD
## mtry = 18 OOB error = 1350.818
## Searching left ...
## mtry = 9 OOB error = 7006.815
## -4.187091 0.05
## Searching right ...
## mtry = 36 OOB error = 155.8048
## 0.8846589 0.05
## mtry = 55 OOB error = 91.25705
## 0.414286 0.05
## mtry = 18 OOB error = 1420.461
## Searching left ...
## mtry = 9 OOB error = 8080.315
## -4.688516 0.05
## Searching right ...
## mtry = 36 OOB error = 136.5892
## 0.9038417 0.05
## mtry = 55 OOB error = 107.578
## 0.2123972 0.05
Based on the output from tuneRF, you can observe that the mtry value that gives the lowest Out-of-Bag (OOB) error. To build the first random forest model, we will use this mtry value.
(mintree <- trf[which.min(trf[,2]),1])
## [1] 55
Here, the model is fitted using the randomForest function to build a predictive model for food commodity prices. The model takes the response variable, the prediction variables and the optimal number of variables to consider at each split. The goal is to generate Variable importance scores which will help us understand which variables have the most significant impact on the response variable, enabling us to interpret the model and possibly simplify it by focusing on the most important predictors.
# Create the random forest model
rf1 <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = training_data,mtry=mintree,importance=TRUE,na.rm=TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf1
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 + bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude + Latitude + rain.sum.lag, data = training_data, mtry = mintree, importance = TRUE, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 36
##
<<<<<<< HEAD
## Mean of squared residuals: 23992.53
## % Var explained: 95.4
varImpPlot(rf1)
## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 154.7101
=======
## Mean of squared residuals: 24098.28
## % Var explained: 95.38
varImpPlot(rf1)
## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 155.1699
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
This calculates the RMSE of the tree in the Random Forest model that has the lowest OOB error.
importance_metrics <- importance(rf1, type=1) # %IncMSE
impvar <- rownames(importance_metrics)[order(importance_metrics[, 1], decreasing=TRUE)]
impvar
## [1] "Year" "Month" "beans" "rice" "Longitude"
<<<<<<< HEAD
## [6] "rain.sum.lag" "fmillet" "wheat" "sorghum" "bmillet"
## [11] "bio_12" "bio_18" "MAI_YLD" "bio_15" "MAI_ARE"
## [16] "bio_3" "ttcity_u5" "Latitude" "ttport_1" "bio_2"
## [21] "bio_8" "bio_4" "bio_1" "bio_14" "bio_5"
## [26] "bio_7" "bio_13" "bio_11" "bio_16" "bio_10"
## [31] "bio_6" "bio_19" "maize" "bio_9" "potato"
=======
## [6] "sorghum" "rain.sum.lag" "wheat" "fmillet" "bmillet"
## [11] "bio_12" "bio_18" "bio_15" "MAI_YLD" "ttcity_u5"
## [16] "Latitude" "MAI_ARE" "bio_3" "ttport_1" "bio_4"
## [21] "bio_1" "bio_8" "bio_2" "bio_7" "bio_13"
## [26] "bio_5" "bio_16" "bio_19" "bio_14" "bio_6"
## [31] "bio_10" "maize" "bio_11" "potato" "bio_9"
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
## [36] "bio_17"
# Get the top 20 variables
top_20_vars <- impvar[1:20]
top_20_vars
## [1] "Year" "Month" "beans" "rice" "Longitude"
<<<<<<< HEAD
## [6] "rain.sum.lag" "fmillet" "wheat" "sorghum" "bmillet"
## [11] "bio_12" "bio_18" "MAI_YLD" "bio_15" "MAI_ARE"
## [16] "bio_3" "ttcity_u5" "Latitude" "ttport_1" "bio_2"
=======
## [6] "sorghum" "rain.sum.lag" "wheat" "fmillet" "bmillet"
## [11] "bio_12" "bio_18" "bio_15" "MAI_YLD" "ttcity_u5"
## [16] "Latitude" "MAI_ARE" "bio_3" "ttport_1" "bio_4"
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
node_purity <- importance(rf1, type=2) # IncNodePurity
# Sort variables by importance (IncNodePurity)
node_purity_sorted <- sort(node_purity[,1], decreasing = TRUE)
node_purity_sorted
## rice beans Year wheat fmillet Month
<<<<<<< HEAD
## 537127817 466842514 380388805 131058156 127560634 114732842
## potato bio_12 maize Longitude rain.sum.lag sorghum
## 49729419 45951002 43414090 43009140 40344444 39020394
## bmillet bio_3 MAI_YLD bio_7 bio_18 ttcity_u5
## 38185514 27679081 24717292 23928361 17424075 15991756
## Latitude MAI_ARE bio_15 ttport_1 bio_6 bio_2
## 14238819 14126778 13630288 12292185 9976545 8630842
## bio_5 bio_4 bio_9 bio_11 bio_8 bio_10
## 7958109 7043691 6035166 5531405 5380741 5294050
## bio_19 bio_1 bio_13 bio_16 bio_14 bio_17
## 5012073 4953527 4828056 4591865 4321322 2834622
=======
## 542294846 475761091 378186164 131333684 130770477 119720507
## potato bio_12 maize rain.sum.lag Longitude sorghum
## 48284677 48043427 42324325 40567947 39033720 38997850
## bmillet bio_3 bio_7 MAI_YLD ttcity_u5 bio_18
## 37845171 28106933 23782771 22171819 20935751 17336321
## Latitude MAI_ARE bio_15 ttport_1 bio_6 bio_2
## 13366783 13338376 13288862 12177421 10042208 9327438
## bio_5 bio_4 bio_9 bio_10 bio_19 bio_8
## 8280274 6942595 6417617 6178000 5506653 5236076
## bio_13 bio_1 bio_11 bio_16 bio_14 bio_17
## 5227136 5169082 4743789 4696699 4618320 3137373
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
# Select the top 20 important variables
top_vars <- names(node_purity_sorted)[1:20]
print(top_vars)
## [1] "rice" "beans" "Year" "wheat" "fmillet"
<<<<<<< HEAD
## [6] "Month" "potato" "bio_12" "maize" "Longitude"
## [11] "rain.sum.lag" "sorghum" "bmillet" "bio_3" "MAI_YLD"
## [16] "bio_7" "bio_18" "ttcity_u5" "Latitude" "MAI_ARE"
rf1$importanceSD
## maize rice sorghum bmillet fmillet wheat
## 1983.76869 2901.76550 571.18768 667.06925 1675.09637 1611.60029
## beans potato Month Year ttcity_u5 ttport_1
## 3025.10953 2056.80364 275.24757 507.18487 260.09925 234.27023
## bio_1 bio_2 bio_3 bio_4 bio_5 bio_6
## 120.24193 228.55221 590.50143 172.02651 196.30755 305.19412
## bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 715.78164 116.03203 290.06653 150.57802 140.41312 823.67181
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 140.32204 99.90244 259.77001 126.00875 136.64634 274.63460
## bio_19 MAI_ARE MAI_YLD Longitude Latitude rain.sum.lag
## 192.57925 259.25070 421.06640 357.49980 337.54338 172.75824
=======
## [6] "Month" "potato" "bio_12" "maize" "rain.sum.lag"
## [11] "Longitude" "sorghum" "bmillet" "bio_3" "bio_7"
## [16] "MAI_YLD" "ttcity_u5" "bio_18" "Latitude" "MAI_ARE"
rf1$importanceSD
## maize rice sorghum bmillet fmillet wheat
## 1903.5260 2751.4873 480.8507 616.1691 1642.3436 1571.3900
## beans potato Month Year ttcity_u5 ttport_1
## 2966.5417 1932.1201 275.9945 532.0094 300.4819 225.4285
## bio_1 bio_2 bio_3 bio_4 bio_5 bio_6
## 112.4326 245.6297 580.2968 147.2850 194.4903 284.8365
## bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 689.6151 111.4434 306.9555 178.1498 154.7698 815.7574
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 128.3945 126.6461 269.3043 125.3841 193.9910 268.8303
## bio_19 MAI_ARE MAI_YLD Longitude Latitude rain.sum.lag
## 186.4857 250.0579 454.9184 340.4519 299.3250 169.0741
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
In this section, we aim to refine our model by selecting the most important variables. We will review the importance metrics (%IncMSE and IncNodePurity) to identify the variables that contribute the most to the model’s predictive power. Our focus will be on variables with higher importance values to ensure a more efficient and interpretable model.
# Estimate more parsimonious specification
rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttport_1 +
bio_3 + bio_6 + bio_9 + bio_12 +
rain.sum.lag,
data=training_data, na.rm=TRUE)
rf
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
<<<<<<< HEAD
## Mean of squared residuals: 30081.9
## % Var explained: 94.23
# evaluate
varImpPlot(rf)
(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 172.4226
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")
# evaluate
varImpPlot(rf)
(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 173.1372
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")
These are prediction plots for each food commodities (maize, beans, rice, sorghum, bmillet, fmillet, wheat, potato) with their respective month of the year 2023.
year <- 2023
Note: we must set the rain.sum.lag variables for each month we are predicting
#Maize
# Create an empty list to store predictions for maize
predict_for_month <- function(month){
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")] # Remember to change depending on year
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_maize <- data.frame(
maize = 1, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_maize, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_maize <- lapply(1:12, predict_for_month)
# Extract pixel values from predictions_maize
maize_values <- unlist(lapply(predictions_maize, values))
# Get min and max values
min_maize <- min(maize_values, na.rm = TRUE)
max_maize <- max(maize_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Create a 3x4 matrix of plots
par(mar = c(0, 0, 0, 0)) # Set margins to 0 for inner plots
for (i in 1:12) {
plot(predictions_maize[[i]], main = paste("Maize prices", toupper(i), year),
zlim = c(min_maize, max_maize), col = color_palette, breaks = seq(min_maize, max_maize, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_maize, max_maize), n = 4)
# Reset plot layout for the legend
layout(matrix(1))
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_maize, max_maize), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Maize Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
# Beans
# Function to predict beans for a given month
predict_for_beans <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_beans <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 1, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_beans, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_beans <- lapply(1:12, predict_for_beans)
# Extract pixel values from predictions_beans
bean_values <- unlist(lapply(predictions_beans, values))
min_bean <- min(bean_values, na.rm = TRUE)
max_bean <- max(bean_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot beans prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_beans[[i]], main = paste("Beans prices", toupper(i), year),
zlim = c(min_bean, max_bean), col = color_palette, breaks = seq(min_bean, max_bean, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bean, max_bean), n = 4)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bean, max_bean), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Beans Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
# Rice
# Function to predict rice prices for a given month
predict_for_rice <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_rice <- data.frame(
maize = 0, rice = 1, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_rice, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_rice <- lapply(1:12, predict_for_rice)
# Extract pixel values from predictions_rice
rice_values <- unlist(lapply(predictions_rice, values))
min_rice <- min(rice_values, na.rm = TRUE)
max_rice <- max(rice_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot rice prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_rice[[i]], main = paste("Rice prices", toupper(i), year),
zlim = c(min_rice, max_rice), col = color_palette, breaks = seq(min_rice, max_rice, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_rice, max_rice), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_rice, max_rice), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Rice Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
# Function to predict sorghum prices for a given month
predict_for_sorghum <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_sorghum <- data.frame(
maize = 0, rice = 0, sorghum = 1, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = 2023
)
pred <- predict(newstack, rf, const = const_sorghum, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_sorghum <- lapply(1:12, predict_for_sorghum)
# Extract pixel values from predictions_sorghum
sorghum_values <- unlist(lapply(predictions_sorghum, values))
min_sorghum <- min(sorghum_values, na.rm = TRUE)
max_sorghum <- max(sorghum_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 300
# Loop through each month to plot sorghum prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_sorghum[[i]], main = paste("Sorghum prices", toupper(i), year),
zlim = c(min_sorghum, max_sorghum), col = color_palette, breaks = seq(min_sorghum, max_sorghum, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_sorghum, max_sorghum), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_sorghum, max_sorghum), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Sorghum Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
# bmillet
# Function to predict bmillet prices for a given month
predict_for_bmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_bmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 1, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_bmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_bmillet <- lapply(1:12, predict_for_bmillet)
# Extract pixel values from predictions_bmillet
bmillet_values <- unlist(lapply(predictions_bmillet, values))
min_bmillet <- min(bmillet_values, na.rm = TRUE)
max_bmillet <- max(bmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot bmillet prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_bmillet[[i]], main = paste("Bmillet prices", toupper(i), year),
zlim = c(min_bmillet, max_bmillet), col = color_palette,
breaks = seq(min_bmillet, max_bmillet, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bmillet, max_bmillet), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bmillet, max_bmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Bulrush Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
#fmillet
# Function to predict fmillet prices for a given month
predict_for_fmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_fmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 1, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_fmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_fmillet <- lapply(1:12, predict_for_fmillet)
# Extract pixel values from predictions_fmillet
fmillet_values <- unlist(lapply(predictions_fmillet, values))
min_fmillet <- min(fmillet_values, na.rm = TRUE)
max_fmillet <- max(fmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot fmillet prices
break_interval <- 300
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_fmillet[[i]], main = paste("Fmillet prices", toupper(i), year),
zlim = c(min_fmillet, max_fmillet), col = color_palette,
breaks = seq(min_fmillet, max_fmillet, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_fmillet, max_fmillet), n = 3)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_fmillet, max_fmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Finger Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
#Wheat
# Function to predict wheat prices for a given month
predict_for_wheat <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_wheat <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 1, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_wheat, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_wheat <- lapply(1:12, predict_for_wheat)
# Extract pixel values from predictions_wheat
wheat_values <- unlist(lapply(predictions_wheat, values))
min_wheat <- min(wheat_values, na.rm = TRUE)
max_wheat <- max(wheat_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Define the break interval for both plot and legend
break_interval <- 100
# Loop through each month to plot wheat prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_wheat[[i]], main = paste("Wheat prices", toupper(i), year),
zlim = c(min_wheat, max_wheat), col = color_palette,
breaks = seq(min_wheat, max_wheat, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_wheat, max_wheat), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_wheat, max_wheat), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Wheat Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
#potatoes
# Function to predict potato prices for a given month
predict_for_potato <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_potato <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 1,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_potato, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_potato <- lapply(1:12, predict_for_potato)
# Extract pixel values from predictions_potato
potato_values <- unlist(lapply(predictions_potato, values))
min_potato <- min(potato_values, na.rm = TRUE)
max_potato <- max(potato_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot potato prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_potato[[i]], main = paste("Potato prices", toupper(i), year),
zlim = c(min_potato, max_potato), col = color_palette,
breaks = seq(min_potato, max_potato, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_potato, max_potato), n = 5) # Adjust n as needed
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_potato, max_potato), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Potato Price (Tsh)", side = 1, line = 2, cex = 0.9))
<<<<<<< HEAD
pred<-predict(object=rf, newdata=validation_data)
actual<-validation_data$pkg
result<-data.frame(actual=actual, predicted=pred)
mse <- mean((actual - pred)^2, na.rm=TRUE)
paste('Mean Squared Error:', mse)
<<<<<<< HEAD
## [1] "Mean Squared Error: 142415.415093055"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error: 176.0013712396"
#Save predicted & observed price
=======
## [1] "Mean Squared Error: 136826.009372833"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error: 176.664380207372"
#Save predicted & observed yield
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
write.csv(result, "result.csv")
#reading result.csv file (predicted vs observed)
rslt <- read.csv("result.csv", header=T)
print(names(rslt))
## [1] "X" "actual" "predicted"
#RMSE predicting from rf - predicited vs observed
rf.rmse<-round(sqrt(mean( (rslt$actual-rslt$predicted)^2 , na.rm = TRUE )),2)
print(rf.rmse)
<<<<<<< HEAD
## [1] 377.38
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,2)
print(rf.r2)
## [1] 0.79
range(actual)
## [1] 350 4050
range(pred)
## [1] 619.2183 3241.6656
=======
## [1] 369.9
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,3)
print(rf.r2)
## [1] 0.8
range(actual)
## [1] 350 4050
range(pred)
## [1] 612.277 3245.126
>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
#plotting predicted Vs observed
ggplot(result, aes(x=actual, y=predicted), alpha=0.6) +
geom_point(colour = "blue", size = 1.4, alpha=0.6) +
ggtitle('Random Forest "Wholesale Grain Prices in Tanzania"') +
scale_x_continuous("Observed Price (Tsh)",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
scale_y_continuous("Predicted Price (Tsh)",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
axis.text.x = element_text(size = 8)) +
geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
annotate("text", x = 300, y = 4500, label = paste("RMSE:", rf.rmse)) +
annotate("text", x = 300, y = 4200, label = paste("R^2: ", rf.r2), parse = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
<<<<<<< HEAD

=======

>>>>>>> 52a8adac6495d75a1fd2a26a43ce6603b2ac564e
library(stats)
mypts_df$pred <- stats::predict(rf)
rsq <- function (obs, pred) cor(obs, pred, use = 'complete.obs') ^ 2
RMSE <- function(obs, pred){sqrt(mean((pred - obs)^2, na.rm = TRUE))}
fr2_rsq <- rsq(mypts_df$pkg, mypts_df$pred) %>% round(digits = 2)
fr2_rmse <- RMSE(mypts_df$pkg, mypts_df$pred) %>% round(digits = 0)
Price_fit_plot <- ggplot(data = mypts_df, aes(x = pkg, y = pred)) +
geom_point(colour = "blue", size = 1.4 ,alpha=0.6) +
ggtitle('Observed vs Predicted "Wholesale Grain Prices in Tanzania"') +
geom_abline(slope = 1, alpha=0.3) +
annotate('text', x = 150, y = 4500, label = paste0("R^{2}==", fr2_rsq), parse = TRUE, size=3) +
annotate('text', x = 150, y = 4200, label = paste0("RMSE==", fr2_rmse), parse = TRUE, size=3) +
labs(x = "Observed Price (Tsh)", y = "Predicted Price (Tsh)") +
xlim(0, 5000) + ylim(0, 5000)
Price_fit_plot
<<<<<<< HEAD
Plot Partial dependence of all the variables used except food commodities and months.
library(caret)
var_importance <- varImp(rf)
impvar <- rownames(var_importance)[order(var_importance[, 1], decreasing=TRUE)]
op <- par(mfrow=c(2, 4))
# exclude food commodities and months
predictors_to_plot <- setdiff(impvar, c("maize", "rice", "sorghum", "bmillet", "fmillet", "wheat", "beans", "potato", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))
for (i in seq_along(predictors_to_plot)) {
partialPlot(rf, as.data.frame(training_data), predictors_to_plot[i], xlab=predictors_to_plot[i],
main="Partial Dependence")
}
<<<<<<< HEAD
# Determine the number of observations for each commodity
#After removing NAs
crop_counts <- table(mypts$Crop)
crop_counts
##
## Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
## 859 858 705 559 731 535 854 832
# Create a data frame with crop and count
crop_summary <- data.frame(
Crop = names(crop_counts),
Count = as.vector(crop_counts)
)
# Print the crop summary
print(crop_summary)
## Crop Count
## 1 Maize 859
## 2 Rice 858
## 3 Sorghum 705
## 4 B.Millet 559
## 5 F.Millet 731
## 6 Wheat 535
## 7 Beans 854
## 8 Potato 832
These are correlation plots for pooled sample in two periods: post-harvest (May-Oct) and lean season (Nov-April).
#We'll use mean Monthly data in wide format
prices.monthly
## Region Market Month Year Latitude Longitude mai.price ric.price
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 469.0000 1530.000
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 485.0000 1650.000
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 501.5000 1592.000
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 375.0000 NaN
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 426.2500 1260.000
## ---
## 860: Mwanza Mwanza 7 2024 -2.51969 32.90144 725.0000 2350.000
## 861: Pwani Pwani 7 2024 -6.44221 38.90622 835.7143 1946.429
## 862: Simiyu Bariadi 7 2024 -2.80355 33.98699 660.0000 1450.000
## 863: Kigoma Kigoma 7 2024 -4.89697 29.65987 578.5714 1417.857
## 864: Rukwa Sumbawanga 7 2024 -7.95239 31.62319 546.7857 1653.571
## sor.price bul.price fin.price whe.price bea.price pot.price
## 1: 695.000 761.000 1333.000 793.0000 1545.000 745.0000
## 2: 900.000 900.000 1775.000 1230.0000 2420.000 730.0000
## 3: 558.000 581.000 1752.000 NaN 2075.000 618.0000
## 4: 437.500 NaN NaN NaN NaN NaN
## 5: 1375.000 1337.500 1637.500 1637.5000 1300.000 675.0000
## ---
## 860: 1428.571 1421.429 1621.429 NaN 2528.571 1142.8571
## 861: 1442.857 1550.000 1857.143 1975.0000 2967.857 1735.7143
## 862: 1400.000 1739.286 1728.571 2457.1429 2750.000 1500.0000
## 863: 1821.429 1892.857 1907.143 1867.8571 2228.571 1075.0000
## 864: NaN NaN 975.000 719.6429 2135.714 641.0714
# Create a function to determine the season
get_season <- function(month) {
if (month %in% 5:10) {
return("Post-Harvest")
} else {
return("Lean Season")
}
}
# Add a 'Season' column to the data
prices.monthly$Season <- sapply(prices.monthly$Month, get_season)
# Post Harvest Data
post_harvest_data <- prices.monthly[prices.monthly$Season == "Post-Harvest", ]
# Remove rows with NaNs from the Post-Harvest data
post_harvest_data <- post_harvest_data[complete.cases(post_harvest_data[, 7:14]), ]
# Lean Season data
lean_season_data <- prices.monthly[prices.monthly$Season == "Lean Season", ]
# Remove rows with NaNs from the Lean Season data
lean_season_data <- lean_season_data[complete.cases(lean_season_data[, 7:14]), ]
# Calculate correlation matrix for Post-Harvest season
post_harvest_corr <- cor(post_harvest_data[, 3:14])
# Calculate correlation matrix for Lean Season
lean_season_corr <- cor(lean_season_data[, 3:14])
# Plot correlation matrix for Post-Harvest season
corrplot(post_harvest_corr,
method = "color",
title = "",
tl.col = "black",
tl.cex = 0.5,
addCoef.col = "black",
number.cex = 0.5,
number.digits = 2)
# Add title to the plot
title(main = "Post-Harvest Correlation Matrix",
line = 4,
cex.main = 0.9)
# Plot correlation matrix for Lean Season
corrplot(lean_season_corr,
method = "color",
title = "",
tl.col = "black",
tl.cex = 0.5,
addCoef.col = "black",
number.cex = 0.5,
number.digits = 2)
# Add title to the plot
title(main = "Lean Season Correlation Matrix",
line = 4,
cex.main = 0.9)
We are doing Crop Specific Price Predictions to determine which model performs better at prediction by comparing predictions r2 or rmse ## Maize Price Prediction
# Filter data for maize
mypts_df2 <- as.data.frame(mypts)
mypts_maize <- mypts_df2[mypts_df2$Crop == "Maize", ]
unique(mypts_maize$Crop)
## [1] Maize
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# Harmonize the levels
mypts_maize <- mypts_maize %>%
mutate(Crop = as.character(Crop)) %>%
mutate(Crop = factor(Crop))
# check again
unique(mypts_maize$Crop)
## [1] Maize
## Levels: Maize
mypts_maize <- vect(mypts_maize, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# Training data for maize
training_data_maize <- mypts_maize[mypts_maize$Year %in% c(2021, 2022, 2023), ]
# Filter the data for validation (Jan 2024 - July 2024)
validation_data_maize <- mypts_maize[mypts_maize$Year == 2024, ]
# Fit the Random Forest Model
rf_maize <- randomForest(pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag,
data = training_data_maize,
na.rm = TRUE)
rf_maize
##
## Call:
## randomForest(formula = pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data_maize, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 12980.05
## % Var explained: 84.36
# evaluate
varImpPlot(rf_maize)
We predict the validation data using the rf model then calculate R^2 and RMSE
pred_maize<-predict(object=rf_maize, newdata=validation_data_maize)
actual_maize<-validation_data_maize$pkg
result_maize<-data.frame(actual=actual_maize, predicted=pred_maize)
#Save predicted & observed yield
write.csv(result_maize, "result_maize.csv")
#reading result_maize.csv file (predicted vs observed)
rslt_m <- read.csv("result_maize.csv", header=T)
print(names(rslt_m))
## [1] "X" "actual" "predicted"
#R-square predicting from rf_maize predicited vs observed
rf_maize.rmse<-round(sqrt(mean( (rslt_m$actual-rslt_m$predicted)^2 , na.rm = TRUE )),2)
print(rf_maize.rmse)
## [1] 325.47
#R-square
rf_maize.r2<-round(summary(lm(actual~predicted, rslt_m))$r.squared,2)
print(rf_maize.r2)
## [1] 0.38
#R-square
rf_maize.r2<-round(summary(lm(actual~predicted, rslt_m))$r.squared,2)
print(rf_maize.r2)
## [1] 0.38
range(actual_maize)
## [1] 350 1250
range(pred_maize, na.rm = TRUE)
## [1] 703.9585 1323.0264
#plotting predicted Vs observed
ggplot(result_maize, aes(x=actual_maize, y=pred_maize), alpha=0.6) +
geom_point(colour = "blue", size = 1.4, alpha=0.6) +
ggtitle('Random Forest "Maize Prices in Tanzania"') +
scale_x_continuous("Observed Price (Tsh)",
limits = c(0, 1500),
breaks = seq(0, 1500, 500)) +
scale_y_continuous("Predicted Price (Tsh)",
limits = c(0, 1500),
breaks = seq(0, 1500, 500)) +
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
axis.text.x = element_text(size = 8)) +
geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
annotate("text", x = 400, y = 1500, label = paste("RMSE:", rf_maize.rmse)) +
annotate("text", x = 400, y = 1400, label = paste("R^2: ", rf_maize.r2), parse = TRUE)
# Filter data for Beans
mypts_beans <- mypts_df2[mypts_df2$Crop == "Beans", ]
unique(mypts_beans$Crop)
## [1] Beans
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# Harmonize the levels
mypts_beans <- mypts_beans %>%
mutate(Crop = as.character(Crop)) %>%
mutate(Crop = factor(Crop))
# check again
unique(mypts_beans$Crop)
## [1] Beans
## Levels: Beans
mypts_beans <- vect(mypts_beans, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# Training data for beans
training_data_beans <- mypts_beans[mypts_beans$Year %in% c(2021, 2022, 2023), ]
# Validation Data beans
validation_data_beans <- mypts_beans[mypts_beans$Year == 2024, ]
# Fit the Random Forest Model
rf_beans <- randomForest(pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag,
data = training_data_beans,
na.rm = TRUE)
rf_beans
##
## Call:
## randomForest(formula = pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data_beans, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 48290.36
## % Var explained: 85.92
# evaluate
varImpPlot(rf_beans)
pred_beans<-predict(object=rf_beans, newdata=validation_data_beans)
actual_beans<-validation_data_beans$pkg
result_beans<-data.frame(actual=actual_beans, predicted=pred_beans)
#Save predicted & observed beans price
write.csv(result_beans, "result_beans.csv")
#reading result_beans.csv file (predicted vs observed)
rslt_b <- read.csv("result_beans.csv", header=T)
print(names(rslt_b))
## [1] "X" "actual" "predicted"
#R-square predicting from rf_beans predicited vs observed
rf_beans.rmse<-round(sqrt(mean( (rslt_b$actual-rslt_b$predicted)^2 , na.rm = TRUE )),2)
print(rf_beans.rmse)
## [1] 348.32
#R-square
rf_beans.r2<-round(summary(lm(actual~predicted, rslt_b))$r.squared,2)
print(rf_beans.r2)
## [1] 0.38
range(actual_beans)
## [1] 1252.5 3850.0
range(pred_beans)
## [1] 1868.940 3287.237
#plotting predicted Vs observed Beans Prices
ggplot(result_beans, aes(x=actual_beans, y=pred_beans), alpha=0.6) +
geom_point(colour = "blue", size = 1.4, alpha=0.6) +
ggtitle('Random Forest "Beans Prices in Tanzania"') +
scale_x_continuous("Observed Price (Tsh)",
limits = c(0, 4000),
breaks = seq(0, 4000, 1000)) +
scale_y_continuous("Predicted Price (Tsh)",
limits = c(0, 4000),
breaks = seq(0, 4000, 1000)) +
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
axis.text.x = element_text(size = 8)) +
geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
annotate("text", x = 1000, y = 3800, label = paste("RMSE:", rf_beans.rmse)) +
annotate("text", x = 1000, y = 3600, label = paste("R^2: ", rf_beans.r2), parse = TRUE)
# Filter data for Rice
mypts_rice <- mypts_df2[mypts_df2$Crop == "Rice", ]
unique(mypts_rice$Crop)
## [1] Rice
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# Harmonize the levels
mypts_rice <- mypts_rice %>%
mutate(Crop = as.character(Crop)) %>%
mutate(Crop = factor(Crop))
# check again
unique(mypts_rice$Crop)
## [1] Rice
## Levels: Rice
mypts_rice <- vect(mypts_rice, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# Training data for rice
training_data_rice <- mypts_rice[mypts_rice$Year %in% c(2021, 2022, 2023), ]
# Validation Data rice
validation_data_rice <- mypts_rice[mypts_rice$Year == 2024, ]
# Fit the Random Forest Model
rf_rice <- randomForest(pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag,
data = training_data_rice,
na.rm = TRUE)
rf_rice
##
## Call:
## randomForest(formula = pkg ~ Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data_rice, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 38862.74
## % Var explained: 89.9
# evaluate
varImpPlot(rf_rice)
pred_rice<-predict(object=rf_rice, newdata=validation_data_rice)
actual_rice<-validation_data_rice$pkg
result_rice<-data.frame(actual=actual_rice, predicted=pred_rice)
#Save predicted & observed rice price
write.csv(result_rice, "result_rice.csv")
#reading result_rice.csv file (predicted vs observed)
rslt_r <- read.csv("result_rice.csv", header=T)
print(names(rslt_r))
## [1] "X" "actual" "predicted"
#R-square predicting from rf_rice predicited vs observed
rf_rice.rmse<-round(sqrt(mean( (rslt_r$actual-rslt_r$predicted)^2 , na.rm = TRUE )),2)
print(rf_rice.rmse)
## [1] 480.29
#R-square
rf_rice.r2<-round(summary(lm(actual~predicted, rslt_r))$r.squared,2)
print(rf_rice.r2)
## [1] 0.29
range(actual_rice)
## [1] 1316.667 3010.000
range(pred_rice)
## [1] 1898.561 3101.155
#plotting predicted Vs observed Rice Prices
ggplot(result_rice, aes(x=actual_rice, y=pred_rice), alpha=0.6) +
geom_point(colour = "blue", size = 1.4, alpha=0.6) +
ggtitle('Random Forest "Rice Prices in Tanzania"') +
scale_x_continuous("Observed Price (Tsh)",
limits = c(0, 4000),
breaks = seq(0, 4000, 1000)) +
scale_y_continuous("Predicted Price (Tsh)",
limits = c(0, 4000),
breaks = seq(0, 4000, 1000)) +
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
axis.text.x = element_text(size = 8)) +
geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
annotate("text", x = 1000, y = 3800, label = paste("RMSE:", rf_rice.rmse)) +
annotate("text", x = 1000, y = 3600, label = paste("R^2: ", rf_rice.r2), parse = TRUE)
We are interested in estimating the prices of agricultural food commodities across space and time, on the basis of prices as observed at some market locations. Here we explore methods to model these prices, using monthly data from across Tanzania over the period May 2021-June 2024.
This document describes the process of fitting a Random Forest model to predict these prices. The performance of the Random Forest model is evaluated using Root Mean Square Errors (RMSE) and R-Square as test statistics. We also compare the observed prices with the predicted prices using an out of sample dataset.
library(geodata)
## Warning: package 'geodata' was built under R version 4.3.3
library(lubridate)
library(terra)
library(data.table)
library(randomForest)
library(httr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
library(pdp)
## Warning: package 'pdp' was built under R version 4.3.3
library(gridExtra)
library(stats)
library(dplyr)
library(stringr)
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Warning: package 'spam' was built under R version 4.3.3
This dataset contains price information for various crops across different regions and markets in Tanzania from 2021 to 2024. The data was acquired from Tanzania’s Ministry of Industry and Trade.
setwd("H:/Tanzania Price data/Datasets")
prices <- fread("Tanzania_Price_Data_AllCrops_with_Coordinates4.csv")
dim(prices)
## [1] 7441 21
head(prices)
## Region Market Maize..min.price. Maize..max.price. Rice..min.price.
## 1: Arusha Arusha 43000 45000 140000
## 2: Dar es Salaam Temeke 46000 47000 120000
## 3: Dodoma Majengo 45000 50000 132000
## 4: Dodoma Kibaigwa 30000 42000 NA
## 5: Kagera Bukoba 33000 35000 110000
## 6: Manyara Babati 30000 45000 120000
## Rice..max.price. Sorghum..min.price. Sorghum..max.price.
## 1: 170000 60000 65000
## 2: 210000 80000 100000
## 3: 200000 50000 60000
## 4: NA 45000 48000
## 5: 140000 140000 150000
## 6: 170000 60000 80000
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## 1: 70000 75000
## 2: 80000 100000
## 3: 52000 64000
## 4: NA NA
## 5: 120000 140000
## 6: 80000 100000
## Finger.Millet..min.price. Finger.Millet..max.price. Wheat..min.price.
## 1: 120000 125000 70000
## 2: 175000 180000 110000
## 3: 200000 252000 NA
## 4: NA NA NA
## 5: 170000 180000 170000
## 6: 120000 130000 100000
## Wheat..max.price. Beans..min.price. Beans..max.price.
## 1: 78000 130000 165000
## 2: 120000 220000 260000
## 3: NA 200000 245000
## 4: NA NA NA
## 5: 180000 110000 170000
## 6: 120000 150000 180000
## Irish.Potatoes..min.price. Irish.Potatoes..max.price. Date Latitude
## 1: 70000 75000 5/5/2021 -3.36968
## 2: 75000 75000 5/5/2021 -6.85097
## 3: 56000 68000 5/5/2021 -6.17971
## 4: NA NA 5/5/2021 -6.08110
## 5: 60000 75000 5/5/2021 -1.33140
## 6: 60000 60000 5/5/2021 -4.21006
## Longitude
## 1: 36.68808
## 2: 39.25672
## 3: 35.74109
## 4: 36.64645
## 5: 31.81293
## 6: 35.74915
table(prices$Market)
##
## Arusha Babati Bariadi Buguruni Bukoba Geita
## 371 227 64 10 373 38
## Igawilo Ilala Iringa Kibaigwa Kigoma Kilimanjaro
## 24 187 375 208 227 13
## Kilombero Kinondoni Lindi Majengo Manyara Mbeya
## 24 202 232 408 115 117
## Mgandini Mnalani Morogoro Moshi Mpanda Mpimbwe
## 24 9 369 225 193 39
## Mtwara Musoma Mwananyamala Mwanza Namfua Njombe
## 384 270 24 333 24 172
## Nyankumbu Pwani Shinyanga Singida Songea Songwe
## 24 55 266 83 344 34
## Sumbawanga Tabora Tandale Tandika Tanga Temeke
## 368 347 41 50 359 153
## Ubungo Ujiji
## 32 4
sapply(prices, class)
## Region Market
## "character" "character"
## Maize..min.price. Maize..max.price.
## "integer" "integer"
## Rice..min.price. Rice..max.price.
## "integer" "integer"
## Sorghum..min.price. Sorghum..max.price.
## "integer" "integer"
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## "integer" "integer"
## Finger.Millet..min.price. Finger.Millet..max.price.
## "integer" "integer"
## Wheat..min.price. Wheat..max.price.
## "integer" "integer"
## Beans..min.price. Beans..max.price.
## "integer" "integer"
## Irish.Potatoes..min.price. Irish.Potatoes..max.price.
## "integer" "integer"
## Date Latitude
## "character" "numeric"
## Longitude
## "numeric"
# Convert to date format
prices$Date <- lubridate::mdy(prices$Date)
Check the Region and Market names and coodinates. Make sure the Region and Market names are harmonized and properly geocoded
unique(prices[Region=="Arusha",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Arusha -3.36968 36.68808
## 2: Kilombero -3.37324 36.67870
unique(prices[Region=="Dar es Salaam",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Temeke -6.850970 39.25672
## 2: Kinondoni -6.784070 39.27007
## 3: Ilala -6.829410 39.26289
## 4: Tandika -6.867912 39.25480
## 5: Buguruni -6.838380 39.24359
## 6: Tandale -6.795230 39.24085
## 7: Ubungo -6.793620 39.20966
## 8: Mwananyamala -6.788890 39.25840
unique(prices[Region=="Dodoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Majengo -6.17971 35.74109
## 2: Kibaigwa -6.08110 36.64645
unique(prices[Region=="Kagera",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bukoba -1.3314 31.81293
unique(prices[Region=="Manyara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Babati -4.21006 35.74915
## 2: Manyara -4.46011 37.20217
unique(prices[Region=="Rukwa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Sumbawanga -7.95239 31.62319
unique(prices[Region=="Mpanda",.(Market, Latitude, Longitude)])
## Empty data.table (0 rows and 3 cols): Market,Latitude,Longitude
unique(prices[Region=="Mtwara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mtwara -10.28009 40.18191
unique(prices[Region=="Tabora",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tabora -5.01972 32.80767
unique(prices[Region=="Tanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tanga -5.074260 39.09993
## 2: Mgandini -5.086235 39.09561
unique(prices[Region=="Iringa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Iringa -7.78001 35.69671
unique(prices[Region=="Kigoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Kigoma -4.89697 29.65987
## 2: Ujiji -4.90837 29.69203
unique(prices[Region=="Morogoro",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Morogoro -6.82771 37.66542
unique(prices[Region=="Mwanza",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mwanza -2.51969 32.90144
unique(prices[Region=="Mara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Musoma -1.49982 33.8083
unique(prices[Region=="Ruvuma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songea -10.67873 35.64836
unique(prices[Region=="Shinyanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Shinyanga -3.67226 33.43069
unique(prices[Region=="Kilimanjaro",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Moshi -3.34865 37.34352
## 2: Kilimanjaro -3.33854 37.32654
unique(prices[Region=="Mbeya",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mbeya -8.909940 33.45517
## 2: Igawilo -8.924246 33.56788
unique(prices[Region=="Katavi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mpanda -6.34295 31.07299
## 2: Mpimbwe -7.24425 31.81782
## 3: Majengo -6.34809 31.07055
unique(prices[Region=="Njombe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Njombe -9.33805 34.76949
unique(prices[Region=="Lindi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Lindi -9.995 39.708
unique(prices[Region=="Singida",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Singida -4.812610 34.7428
## 2: Namfua -4.821121 34.7470
unique(prices[Region=="Pwani",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Pwani -6.44221 38.90622
## 2: Mnalani -6.44221 38.90622
unique(prices[Region=="Simiyu",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bariadi -2.80355 33.98699
unique(prices[Region=="Geita",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Geita -2.870760 32.23408
## 2: Nyankumbu -2.895955 32.22871
unique(prices[Region=="Songwe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songwe -8.95179 33.24377
setnames(prices, old = "Maize..min.price.", new = "mai.price.min")
setnames(prices, old = "Rice..min.price.", new = "ric.price.min")
setnames(prices, old = "Sorghum..min.price.", new = "sor.price.min")
setnames(prices, old = "Bulrush.Millet..min.price.", new = "bul.price.min")
setnames(prices, old = "Finger.Millet..min.price.", new = "fin.price.min")
setnames(prices, old = "Wheat..min.price.", new = "whe.price.min")
setnames(prices, old = "Beans..min.price.", new = "bea.price.min")
setnames(prices, old = "Irish.Potatoes..min.price.", new = "pot.price.min")
setnames(prices, old = "Maize..max.price.", new = "mai.price.max")
setnames(prices, old = "Rice..max.price.", new = "ric.price.max")
setnames(prices, old = "Sorghum..max.price.", new = "sor.price.max")
setnames(prices, old = "Bulrush.Millet..max.price.", new = "bul.price.max")
setnames(prices, old = "Finger.Millet..max.price.", new = "fin.price.max")
setnames(prices, old = "Wheat..max.price.", new = "whe.price.max")
setnames(prices, old = "Beans..max.price.", new = "bea.price.max")
setnames(prices, old = "Irish.Potatoes..max.price.", new = "pot.price.max")
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "integer" "integer" "integer"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "integer" "integer" "integer" "integer" "integer"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "integer" "integer" "integer" "integer" "integer"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "integer" "integer" "integer" "Date" "numeric"
## Longitude
## "numeric"
#convert prices to numeric
prices$mai.price.min <- as.numeric(prices$mai.price.min)
prices$ric.price.min <- as.numeric(prices$ric.price.min)
prices$sor.price.min <- as.numeric(prices$sor.price.min)
prices$bul.price.min <- as.numeric(prices$bul.price.min)
prices$fin.price.min <- as.numeric(prices$fin.price.min)
prices$whe.price.min <- as.numeric(prices$whe.price.min)
prices$bea.price.min <- as.numeric(prices$bea.price.min)
prices$pot.price.min <- as.numeric(prices$pot.price.min)
prices$mai.price.max <- as.numeric(prices$mai.price.max)
prices$ric.price.max <- as.numeric(prices$ric.price.max)
prices$sor.price.max <- as.numeric(prices$sor.price.max)
prices$bul.price.max <- as.numeric(prices$bul.price.max)
prices$fin.price.max <- as.numeric(prices$fin.price.max)
prices$whe.price.max <- as.numeric(prices$whe.price.max)
prices$bea.price.max <- as.numeric(prices$bea.price.max)
prices$pot.price.max <- as.numeric(prices$pot.price.max)
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "numeric" "numeric" "numeric"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "numeric" "numeric" "numeric" "numeric" "numeric"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "numeric" "numeric" "numeric" "numeric" "numeric"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "numeric" "numeric" "numeric" "Date" "numeric"
## Longitude
## "numeric"
# convert to price per kg
prices$mai.price.min <- prices$mai.price.min/100
prices$ric.price.min <- prices$ric.price.min/100
prices$sor.price.min <- prices$sor.price.min/100
prices$bul.price.min <- prices$bul.price.min/100
prices$fin.price.min <- prices$fin.price.min/100
prices$whe.price.min <- prices$whe.price.min/100
prices$bea.price.min <- prices$bea.price.min/100
prices$pot.price.min <- prices$pot.price.min/100
prices$mai.price.max <- prices$mai.price.max/100
prices$ric.price.max <- prices$ric.price.max/100
prices$sor.price.max <- prices$sor.price.max/100
prices$bul.price.max <- prices$bul.price.max/100
prices$fin.price.max <- prices$fin.price.max/100
prices$whe.price.max <- prices$whe.price.max/100
prices$bea.price.max <- prices$bea.price.max/100
prices$pot.price.max <- prices$pot.price.max/100
# calculate average of min and max
prices$mai.price <- (prices$mai.price.min + prices$mai.price.max) / 2
prices$ric.price <- (prices$ric.price.min + prices$ric.price.max) / 2
prices$sor.price <- (prices$sor.price.min + prices$sor.price.max) / 2
prices$bul.price <- (prices$bul.price.min + prices$bul.price.max) / 2
prices$fin.price <- (prices$fin.price.min + prices$fin.price.max) / 2
prices$whe.price <- (prices$whe.price.min + prices$whe.price.max) / 2
prices$bea.price <- (prices$bea.price.min + prices$bea.price.max) / 2
prices$pot.price <- (prices$pot.price.min + prices$pot.price.max) / 2
#We can add dates by using the year and the month names
prices$Day <- day(prices$Date)
prices$Month <- month(prices$Date)
prices$Year <- year(prices$Date)
# drop unneccessary columns
prices <- prices[,!c("mai.price.min", "mai.price.max",
"ric.price.min", "ric.price.max",
"sor.price.min", "sor.price.max",
"bul.price.min", "bul.price.max",
"fin.price.min", "fin.price.max",
"whe.price.min", "whe.price.max",
"bea.price.min", "bea.price.max",
"pot.price.min", "pot.price.max")]
# calculate monthly mean prices by market
prices.monthly <- prices[, .(mai.price = mean(mai.price, na.rm = TRUE),
ric.price = mean(ric.price, na.rm = TRUE),
sor.price = mean(sor.price, na.rm = TRUE),
bul.price = mean(bul.price, na.rm = TRUE),
fin.price = mean(fin.price, na.rm = TRUE),
whe.price = mean(whe.price, na.rm = TRUE),
bea.price = mean(bea.price, na.rm = TRUE),
pot.price = mean(pot.price, na.rm = TRUE)),
by=.(Region, Market, Month, Year, Latitude, Longitude)]
# reshape to long (so that prices for different commodities can be simultaneously estimated)
prices.monthly
## Region Market Month Year Latitude Longitude mai.price ric.price
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 469.0000 1530.000
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 485.0000 1650.000
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 501.5000 1592.000
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 375.0000 NaN
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 426.2500 1260.000
## ---
## 836: Mwanza Mwanza 6 2024 -2.51969 32.90144 725.0000 2350.000
## 837: Pwani Mnalani 6 2024 -6.44221 38.90622 772.2222 2000.000
## 838: Simiyu Bariadi 6 2024 -2.80355 33.98699 563.3333 1316.667
## 839: Kigoma Kigoma 6 2024 -4.89697 29.65987 504.2222 1488.889
## 840: Rukwa Sumbawanga 6 2024 -7.95239 31.62319 533.3333 1750.000
## sor.price bul.price fin.price whe.price bea.price pot.price
## 1: 695.000 761.000 1333.000 793.0000 1545.000 745.0000
## 2: 900.000 900.000 1775.000 1230.0000 2420.000 730.0000
## 3: 558.000 581.000 1752.000 NaN 2075.000 618.0000
## 4: 437.500 NaN NaN NaN NaN NaN
## 5: 1375.000 1337.500 1637.500 1637.5000 1300.000 675.0000
## ---
## 836: 1400.000 1350.000 1550.000 NaN 2400.000 1000.0000
## 837: 1522.222 1544.444 2022.222 2400.0000 3116.667 1800.0000
## 838: 1300.000 1788.889 1716.667 2316.6667 2805.556 1500.0000
## 839: 1266.667 1266.667 1433.333 2122.2222 2216.667 1127.7778
## 840: NaN NaN 1088.889 780.5556 1888.889 652.7778
prices.monthly.long <- melt(prices.monthly, id.vars=c('Region', 'Market', 'Month', 'Year', 'Latitude', 'Longitude'),)
# rename columns
setnames(prices.monthly.long, old="variable", new="Crop")
setnames(prices.monthly.long, old="value", new="pkg")
# replace crop names
prices.monthly.long[Crop == "mai.price", Crop := "Maize"]
prices.monthly.long[Crop == "ric.price", Crop := "Rice"]
prices.monthly.long[Crop == "sor.price", Crop := "Sorghum"]
prices.monthly.long[Crop == "bul.price", Crop := "B.Millet"]
prices.monthly.long[Crop == "fin.price", Crop := "F.Millet"]
prices.monthly.long[Crop == "whe.price", Crop := "Wheat"]
prices.monthly.long[Crop == "bea.price", Crop := "Beans"]
prices.monthly.long[Crop == "pot.price", Crop := "Potato"]
# Reset the factor levels to updated levels
prices.monthly.long[, Crop := factor(Crop)]
# Check the unique values again
unique(prices.monthly.long$Crop)
## [1] Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# generate dummies to use in place of factors (for later spatial predictions, which are struggling with factors)
prices.monthly.long[, maize := ifelse(Crop == "Maize",1,0)]
prices.monthly.long[, rice := ifelse(Crop == "Rice",1,0)]
prices.monthly.long[, sorghum := ifelse(Crop == "Sorghum",1,0)]
prices.monthly.long[, bmillet := ifelse(Crop == "B.Millet",1,0)]
prices.monthly.long[, fmillet := ifelse(Crop == "F.Millet",1,0)]
prices.monthly.long[, wheat := ifelse(Crop == "Wheat",1,0)]
prices.monthly.long[, beans := ifelse(Crop == "Beans",1,0)]
prices.monthly.long[, potato := ifelse(Crop == "Potato",1,0)]
prices.monthly.long[, jan := ifelse(Month == 1 , 1, 0)]
prices.monthly.long[, feb := ifelse(Month == 2 , 1, 0)]
prices.monthly.long[, mar := ifelse(Month == 3 , 1, 0)]
prices.monthly.long[, apr := ifelse(Month == 4 , 1, 0)]
prices.monthly.long[, may := ifelse(Month == 5 , 1, 0)]
prices.monthly.long[, jun := ifelse(Month == 6 , 1, 0)]
prices.monthly.long[, jul := ifelse(Month == 7 , 1, 0)]
prices.monthly.long[, aug := ifelse(Month == 8 , 1, 0)]
prices.monthly.long[, sep := ifelse(Month == 9 , 1, 0)]
prices.monthly.long[, oct := ifelse(Month == 10 , 1, 0)]
prices.monthly.long[, nov := ifelse(Month == 11 , 1, 0)]
prices.monthly.long[, dec := ifelse(Month == 12 , 1, 0)]
# replace NaN with NAs in the price observations
prices.monthly.long[is.nan(pkg), pkg := NA]
# Remove observations with missing observations
prices.monthly.long <- na.omit(prices.monthly.long)
# bring in raster stack as predictors
geodata_path("H:/Tanzania Price data/Datasets/geodata")
list.files("H:/Tanzania Price data/Datasets/geodata", recursive=TRUE)
## [1] "climate/wc2.1_country/TZA_wc2.1_30s_bio.tif"
## [2] "travel/travel_time_to_cities_u5.tif"
## [3] "TRUE/gadm/gadm41_TZA_0_pk.rds"
## [4] "TRUE/gadm/gadm41_TZA_1_pk.rds"
## [5] "TRUE/gadm/gadm41_TZA_2_pk.rds"
## [6] "TRUE/gadm/gadm41_TZA_3_pk.rds"
## [7] "TRUE/spam/spam2017v2r1_ssa_area.zip"
## [8] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif"
## [9] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_H.tif"
## [10] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_I.tif"
## [11] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_L.tif"
## [12] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_R.tif"
## [13] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_S.tif"
## [14] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_A.tif"
## [15] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_H.tif"
## [16] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_I.tif"
## [17] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_L.tif"
## [18] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif"
## [19] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_S.tif"
## [20] "TRUE/spam/spam2017v2r1_ssa_yield.zip"
## [21] "TRUE/travel/travel_time_to_cities_u5.tif"
## [22] "TRUE/travel/travel_time_to_ports_1.tif"
## [23] "TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif"
# tza0 <- gadm(country="TZA", level=0)
# tza1 <- gadm(country="TZA", level=1)
# tza2 <- gadm(country="TZA", level=2)
# tza3 <- gadm(country="TZA", level=3)
tza0 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_0_pk.rds")
tza1 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_1_pk.rds")
tza2 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_2_pk.rds")
tza3 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_3_pk.rds")
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# see if these show up correctly
plot(tza1)
plot(mypts, col="Red", add=TRUE)
# text(mypts, label="Market")
# create reference raster
tza_extent <- ext(tza1) |> floor()
r <- crop(rast(res=1/12), tza_extent)
## Interpolate
#xy <- as.matrix(mypts[,c("Longitude", "Latitude")])
xy <- geom(mypts)[,c("y","x")]
#tps <- Tps(xy, p$spatial)
tps <- Tps(xy, mypts$pkg)
sp <- interpolate(r, tps)
sp <- mask(sp, tza1)
plot(sp)
lines(tza1)
rf <- randomForest(pkg ~ Longitude + Latitude , data=mypts)
sp3 <- interpolate(r, rf, xyNames=c("Longitude", "Latitude"))
sp3 <- mask(sp3, tza1)
plot(sp3)
lines(tza1)
The covariates used include a mix of crop-specific indicators, temporal variables to capture monthly and yearly effects, geographical coordinates, accessibility measures, climatic conditions, and lagged rainfall to account for delayed effects of weather on crop prices.
ttcity <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_cities_u5.tif")
ttport <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_ports_1.tif")
clm <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif")
area <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif")
yield <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif")
popd <- rast("gpw_v4_population_density_rev11_2020_10m.tif")
names(ttcity) <- c("ttcity_u5") ## travel time cities of 100k or more
names(ttport) <- c("ttport_1") ## travel time to major ports
names(clm) <- gsub("wc2.1_30s_", "", names(clm))
names(area) <- c("MAI_ARE") # SPAM maize area 2010
names(yield) <- c("MAI_YLD") # SPAM maize yield 2010
names(popd) <- c("popdens") # GPW4
comment(ttcity) <- "travel time to cities 100k or more"
comment(ttport) <- "travel time to major ports"
comment(popd) <- "population density 2020 (GPW4 @ 10dm)"
comment(clm)[1] <-"BIO1 = Annual Mean Temperature"
comment(clm)[2] <-"BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))"
comment(clm)[3] <-"BIO3 = Isothermality (BIO2/BIO7) (×100)"
comment(clm)[4] <-"BIO4 = Temperature Seasonality (standard deviation ×100)"
comment(clm)[5] <-"BIO5 = Max Temperature of Warmest Month"
comment(clm)[6] <-"BIO6 = Min Temperature of Coldest Month"
comment(clm)[7] <-"BIO7 = Temperature Annual Range (BIO5-BIO6)"
comment(clm)[8] <-"BIO8 = Mean Temperature of Wettest Quarter"
comment(clm)[9] <-"BIO9 = Mean Temperature of Driest Quarter"
comment(clm)[10] <-"BIO10 = Mean Temperature of Warmest Quarter"
comment(clm)[11] <-"BIO11 = Mean Temperature of Coldest Quarter"
comment(clm)[12] <-"BIO12 = Annual Precipitation"
comment(clm)[13] <-"BIO13 = Precipitation of Wettest Month"
comment(clm)[14] <-"BIO14 = Precipitation of Driest Month"
comment(clm)[15] <-"BIO15 = Precipitation Seasonality (Coefficient of Variation)"
comment(clm)[16] <-"BIO16 = Precipitation of Wettest Quarter"
comment(clm)[17] <-"BIO17 = Precipitation of Driest Quarter"
comment(clm)[18] <-"BIO18 = Precipitation of Warmest Quarter"
comment(clm)[19] <-"BIO19 = Precipitation of Coldest Quarter"
ttcity <- resample(ttcity, r)
ttport <- resample(ttport, r)
clm <- resample(clm, r)
area <- resample(area, r)
popd <- resample(popd, r)
freq(is.na(area))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
area <- classify(area, cbind(NA,0))
yield <- resample(yield, r)
freq(is.na(yield))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
yield <- classify(yield, cbind(NA,0))
# check again
compareGeom(ttcity, ttport, clm, area, yield, popd)
## [1] TRUE
latgrd <- longrd <- r
latgrd[] <- yFromCell(latgrd, 1:ncell(latgrd))
longrd[] <- xFromCell(longrd, 1:ncell(longrd))
names(latgrd) <- c("latitude")
names(longrd) <- c("longitude")
rstack <- c(ttcity, ttport, clm, area, yield, popd, latgrd, longrd)
names(rstack)
## [1] "ttcity_u5" "ttport_1" "bio_1" "bio_2" "bio_3" "bio_4"
## [7] "bio_5" "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [13] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15" "bio_16"
## [19] "bio_17" "bio_18" "bio_19" "MAI_ARE" "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
# create focal mean to extract from (as alternative to using buffers for extraction, which are not supported in terra)
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack, w=fm, fun="mean", na.policy="all", fillvalue=NA, # na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
# extract values to dataset -- use a 20km buffer
# do a focal sum of 20km radius - this is about 0.18 of a decimal degree... 0.18*112=20.16
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack2, w=fm, fun="sum", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
chirps_path <- "H:/Tanzania Price data/chirps_data"
chirps_files <- list.files(chirps_path, pattern = ".tif$", full.names = TRUE)
# Read all CHIRPS data files into a SpatRaster collection
chirps_rasters <- rast(chirps_files)
#crop to Tanzania boundary
Chirps_Tz <- crop(chirps_rasters, tza1)
writeRaster(Chirps_Tz, "Tz_chirps_monthly_croped.tif", overwrite=TRUE)
Tz_chirps_monthly <- terra::rast("Tz_chirps_monthly_croped.tif")
Tz_chirps_monthly
## class : SpatRaster
## dimensions : 215, 222, 44 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source : Tz_chirps_monthly_croped.tif
## names : chirp~20.11, chirp~20.12, chirp~21.01, chirp~21.02, chirp~21.03, chirp~21.04, ...
## min values : -9999.0000, -9999.0000, -9999.000, -9999.0000, -9999.0000, -9999.0000, ...
## max values : 459.7112, 504.3329, 508.448, 585.3241, 750.1949, 960.3376, ...
#Replace -9999 with NA
Tz_chirps_monthly <- classify(Tz_chirps_monthly, cbind(-9999,NA))
#extract layer names
layer_names <- names(Tz_chirps_monthly)
layer_names
## [1] "chirps-v2.0.2020.11" "chirps-v2.0.2020.12" "chirps-v2.0.2021.01"
## [4] "chirps-v2.0.2021.02" "chirps-v2.0.2021.03" "chirps-v2.0.2021.04"
## [7] "chirps-v2.0.2021.05" "chirps-v2.0.2021.06" "chirps-v2.0.2021.07"
## [10] "chirps-v2.0.2021.08" "chirps-v2.0.2021.09" "chirps-v2.0.2021.10"
## [13] "chirps-v2.0.2021.11" "chirps-v2.0.2021.12" "chirps-v2.0.2022.01"
## [16] "chirps-v2.0.2022.02" "chirps-v2.0.2022.03" "chirps-v2.0.2022.04"
## [19] "chirps-v2.0.2022.05" "chirps-v2.0.2022.06" "chirps-v2.0.2022.07"
## [22] "chirps-v2.0.2022.08" "chirps-v2.0.2022.09" "chirps-v2.0.2022.10"
## [25] "chirps-v2.0.2022.11" "chirps-v2.0.2022.12" "chirps-v2.0.2023.01"
## [28] "chirps-v2.0.2023.02" "chirps-v2.0.2023.03" "chirps-v2.0.2023.04"
## [31] "chirps-v2.0.2023.05" "chirps-v2.0.2023.06" "chirps-v2.0.2023.07"
## [34] "chirps-v2.0.2023.08" "chirps-v2.0.2023.09" "chirps-v2.0.2023.10"
## [37] "chirps-v2.0.2023.11" "chirps-v2.0.2023.12" "chirps-v2.0.2024.01"
## [40] "chirps-v2.0.2024.02" "chirps-v2.0.2024.03" "chirps-v2.0.2024.04"
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06"
# We need to create a sequence of dates from the layer names
# Extract year and month from layer names and convert to Date
dates <- as.Date(paste0(sub("chirps-v2.0\\.", "", layer_names), "-01"), format = "%Y.%m-%d")
dates
## [1] "2020-11-01" "2020-12-01" "2021-01-01" "2021-02-01" "2021-03-01"
## [6] "2021-04-01" "2021-05-01" "2021-06-01" "2021-07-01" "2021-08-01"
## [11] "2021-09-01" "2021-10-01" "2021-11-01" "2021-12-01" "2022-01-01"
## [16] "2022-02-01" "2022-03-01" "2022-04-01" "2022-05-01" "2022-06-01"
## [21] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [26] "2022-12-01" "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01"
## [31] "2023-05-01" "2023-06-01" "2023-07-01" "2023-08-01" "2023-09-01"
## [36] "2023-10-01" "2023-11-01" "2023-12-01" "2024-01-01" "2024-02-01"
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01"
# Assign these dates to the SpatRaster object
time(Tz_chirps_monthly) <- dates
#rename the layers to the formatted dates
names(Tz_chirps_monthly) <- dates
# Check the SpatRaster object
print(Tz_chirps_monthly)
## class : SpatRaster
## dimensions : 215, 222, 44 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 5.859746, 1.903993, 0.8161677, 0.3187093, 0.9539621, 1.17571, ...
## max values : 459.711212, 504.332947, 508.4479980, 585.3240967, 750.1949463, 960.33765, ...
## time (days) : 2020-11-01 to 2024-06-01
# do a focal mean of 100km radius - this is about 0.9 of a decimal degree... 0.9009*112=100.9008
# Calculate the focal mean for each layer (month)
fm_r <- focalMat(Tz_chirps_monthly, d=0.9, type='circle', fillNA=FALSE)
Rainfall_focal_sum_100km <- focal(Tz_chirps_monthly, w=fm_r, fun="mean", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE)
# Check the result
Rainfall_focal_sum_100km
## class : SpatRaster
## dimensions : 215, 222, 44 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.79649, 27.27837, 8.092316, 4.489223, 11.30184, 15.74836, ...
## max values : 313.14248, 326.88227, 416.590468, 395.422070, 464.67840, 569.81990, ...
## time (days) : 2020-11-01 to 2024-06-01
Rainfall <- Rainfall_focal_sum_100km
#Resample
Rainfall_res <- resample(Rainfall, r)
Rainfall_res
## class : SpatRaster
## dimensions : 144, 144, 44 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.89972, 32.9472, 8.116516, 4.527868, 11.83634, 15.90436, ...
## max values : 310.41034, 326.4339, 416.389160, 395.367279, 462.19266, 550.26270, ...
## time (days) : 2020-11-01 to 2024-06-01
Here we define function that calculates 6 month lag sum of rainfall for each month in our data. The output is raster datsets.
calculate_lagged_sum <- function(raster_stack, num_months = 6) {
# Get the time vector from the raster stack
time_vector <- time(raster_stack)
# Initialize list to store lagged sum rasters
lagged_sum_rasters <- vector("list", length(time_vector))
# Loop through each layer in the raster stack
for (i in seq_along(time_vector)) {
if (i > num_months) { # We need at least 'num_months' previous layers to calculate the lagged sum
# Determine the start and end dates for the lag period
end_date <- time_vector[i] # Date of the current layer being processed
start_date <- end_date %m-% months(num_months) #The date num_months before the end_date
# Select the layers that fall within the lag period
lag_period_layers <- raster_stack[[which(time_vector > start_date & time_vector <= end_date)]]
# Calculate the sum of the selected layers
if (nlyr(lag_period_layers) == num_months) {
lagged_sum_rasters[[i]] <- sum(lag_period_layers, na.rm = TRUE)
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
}
# Combine the lagged sum rasters into a single raster stack, excluding empty rasters
lagged_sum_stack <- rast(lagged_sum_rasters)
# Set the names for the layers in the lagged sum stack
names(lagged_sum_stack) <- names(raster_stack)[!is.na(lagged_sum_rasters)]
return(lagged_sum_stack)
}
# Calculate the 6-month lagged sum for each period in the raster stack
lagged_rainfall_sum <- calculate_lagged_sum(Rainfall_res, num_months = 6)
# Remove the first 6 layers from the raster stack since they are empty
lagged_rainfall_sum_filtered <- lagged_rainfall_sum[[7:nlyr(lagged_rainfall_sum)]]
# check the result
print(lagged_rainfall_sum_filtered)
## class : SpatRaster
## dimensions : 144, 144, 38 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2021-05-01, 2021-06-01, 2021-07-01, 2021-08-01, 2021-09-01, 2021-10-01, ...
## min values : 172.7856, 105.5447, 105.4121, 105.1864, 26.20571, 7.070483, ...
## max values : 1512.4689, 1355.9669, 1052.9220, 1051.9214, 1038.13874, 643.716021, ...
names(lagged_rainfall_sum_filtered) <- paste0(names(lagged_rainfall_sum_filtered), "_rain.sum.lag")
plot(lagged_rainfall_sum_filtered)
## We'll have to include lagged_rainfall_sum_filtered in the predictor stack.
rstack
## class : SpatRaster
## dimensions : 144, 144, 26 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : ttcity_u5, ttport_1, bio_1, bio_2, bio_3, bio_4, ...
## min values : 3.350081e-02, 997.9427, 4.333545, 6.546645, 53.85881, 19.75255, ...
## max values : 1.166560e+03, 3043.3459, 28.472235, 15.285786, 93.14099, 266.33594, ...
names(rstack)
## [1] "ttcity_u5" "ttport_1" "bio_1" "bio_2" "bio_3" "bio_4"
## [7] "bio_5" "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [13] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15" "bio_16"
## [19] "bio_17" "bio_18" "bio_19" "MAI_ARE" "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
rstack3 <- c(rstack, lagged_rainfall_sum_filtered)
names(rstack3)
## [1] "ttcity_u5" "ttport_1"
## [3] "bio_1" "bio_2"
## [5] "bio_3" "bio_4"
## [7] "bio_5" "bio_6"
## [9] "bio_7" "bio_8"
## [11] "bio_9" "bio_10"
## [13] "bio_11" "bio_12"
## [15] "bio_13" "bio_14"
## [17] "bio_15" "bio_16"
## [19] "bio_17" "bio_18"
## [21] "bio_19" "MAI_ARE"
## [23] "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
## [27] "2021-05-01_rain.sum.lag" "2021-06-01_rain.sum.lag"
## [29] "2021-07-01_rain.sum.lag" "2021-08-01_rain.sum.lag"
## [31] "2021-09-01_rain.sum.lag" "2021-10-01_rain.sum.lag"
## [33] "2021-11-01_rain.sum.lag" "2021-12-01_rain.sum.lag"
## [35] "2022-01-01_rain.sum.lag" "2022-02-01_rain.sum.lag"
## [37] "2022-03-01_rain.sum.lag" "2022-04-01_rain.sum.lag"
## [39] "2022-05-01_rain.sum.lag" "2022-06-01_rain.sum.lag"
## [41] "2022-07-01_rain.sum.lag" "2022-08-01_rain.sum.lag"
## [43] "2022-09-01_rain.sum.lag" "2022-10-01_rain.sum.lag"
## [45] "2022-11-01_rain.sum.lag" "2022-12-01_rain.sum.lag"
## [47] "2023-01-01_rain.sum.lag" "2023-02-01_rain.sum.lag"
## [49] "2023-03-01_rain.sum.lag" "2023-04-01_rain.sum.lag"
## [51] "2023-05-01_rain.sum.lag" "2023-06-01_rain.sum.lag"
## [53] "2023-07-01_rain.sum.lag" "2023-08-01_rain.sum.lag"
## [55] "2023-09-01_rain.sum.lag" "2023-10-01_rain.sum.lag"
## [57] "2023-11-01_rain.sum.lag" "2023-12-01_rain.sum.lag"
## [59] "2024-01-01_rain.sum.lag" "2024-02-01_rain.sum.lag"
## [61] "2024-03-01_rain.sum.lag" "2024-04-01_rain.sum.lag"
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
#Extract to the point dataset
extr1 <- terra::extract(rstack3, mypts, method = "bilinear")
mypts <- cbind(mypts, extr1)
# Remove the ID column from the dataset
mypts <- mypts[, !names(mypts) %in% "ID"]
head(mypts)
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1 Arusha Arusha 5 2021 -3.36968 36.68808 Maize 469.00 1 0
## 2 Dar es Salaam Temeke 5 2021 -6.85097 39.25672 Maize 485.00 1 0
## 3 Dodoma Majengo 5 2021 -6.17971 35.74109 Maize 501.50 1 0
## 4 Dodoma Kibaigwa 5 2021 -6.08110 36.64645 Maize 375.00 1 0
## 5 Kagera Bukoba 5 2021 -1.33140 31.81293 Maize 426.25 1 0
## 6 Manyara Babati 5 2021 -4.21006 35.74915 Maize 375.00 1 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## sep oct nov dec ttcity_u5 ttport_1 bio_1 bio_2 bio_3 bio_4
## 1 0 0 0 0 6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2 0 0 0 0 0.5137705 1692.300 25.93087 8.970697 67.33757 153.60644
## 3 0 0 0 0 11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4 0 0 0 0 83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5 0 0 0 0 17.1729083 2006.820 20.84789 8.869465 83.92802 38.70414
## 6 0 0 0 0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
## bio_5 bio_6 bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375 882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572 582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717 649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068 768.0339
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18 bio_19
## 1 237.8160 7.36089814 92.10601 482.4729 24.95659498 276.6105 32.88688141
## 2 264.3072 28.32696315 76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 133.9309 0.01660535 115.12207 375.4677 0.08605806 284.0681 0.08605806
## 4 129.1152 0.21909479 100.22821 368.5410 2.45863517 320.1588 4.15077049
## 5 346.9308 43.30159629 58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006 0.91639794 89.44063 381.6907 6.74877687 325.6114 12.35086025
## MAI_ARE MAI_YLD popdens latitude longitude
## 1 3.107393e+02 1.528850e+02 994.5706 -3.36968 36.68808
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097 39.25672
## 3 1.283795e-04 9.139631e-03 352.5410 -6.17971 35.74109
## 4 1.489781e+03 3.828877e+02 101.0565 -6.08110 36.64645
## 5 6.446327e+02 1.834932e+03 280.9803 -1.33140 31.81293
## 6 5.496486e+02 1.917829e+03 219.1393 -4.21006 35.74915
## 2021-05-01_rain.sum.lag 2021-06-01_rain.sum.lag 2021-07-01_rain.sum.lag
## 1 541.4019 482.1614 435.6677
## 2 809.3149 777.8958 722.6213
## 3 637.1213 535.1131 365.9182
## 4 656.0751 567.7319 416.4267
## 5 1149.9932 937.6197 853.1074
## 6 576.1356 490.7897 395.8152
## 2021-08-01_rain.sum.lag 2021-09-01_rain.sum.lag 2021-10-01_rain.sum.lag
## 1 386.3060 320.00676 161.93116
## 2 649.9489 604.59631 337.22359
## 3 183.7701 52.54273 15.42844
## 4 277.1030 153.91512 66.15258
## 5 811.6691 763.24715 615.07899
## 6 289.9778 154.11524 68.29101
## 2021-11-01_rain.sum.lag 2021-12-01_rain.sum.lag 2022-01-01_rain.sum.lag
## 1 101.83447 179.80052 232.6533
## 2 216.69636 288.29559 406.3529
## 3 19.66440 70.12616 416.5519
## 4 49.56118 106.72427 413.7123
## 5 488.90052 605.55387 808.6724
## 6 46.47620 108.69646 245.5580
## 2022-02-01_rain.sum.lag 2022-03-01_rain.sum.lag 2022-04-01_rain.sum.lag
## 1 330.6131 383.2848 540.3073
## 2 490.8561 534.5386 763.3206
## 3 817.0083 917.8080 1000.7087
## 4 665.6779 764.1859 927.5299
## 5 899.5895 957.6149 1138.6710
## 6 441.5905 510.5650 641.9617
## 2022-05-01_rain.sum.lag 2022-06-01_rain.sum.lag 2022-07-01_rain.sum.lag
## 1 530.1299 455.6274 403.9131
## 2 816.5829 729.6579 636.5197
## 3 997.3701 947.9434 601.5170
## 4 958.6012 903.3041 595.9210
## 5 1219.1735 1115.3495 905.1190
## 6 643.2443 581.5847 444.9141
## 2022-08-01_rain.sum.lag 2022-09-01_rain.sum.lag 2022-10-01_rain.sum.lag
## 1 305.5547 252.9373 100.53888
## 2 552.9257 495.7899 268.19916
## 3 201.0583 100.2519 14.64072
## 4 344.1166 244.4002 79.55153
## 5 853.5636 777.4608 541.86704
## 6 248.8724 179.9022 48.53265
## 2022-11-01_rain.sum.lag 2022-12-01_rain.sum.lag 2023-01-01_rain.sum.lag
## 1 118.72860 197.4710 209.2468
## 2 249.91483 366.5402 424.9013
## 3 24.33803 210.4590 364.0985
## 4 63.76563 237.8550 386.3680
## 5 469.63839 598.3139 701.7206
## 6 63.04448 191.9892 242.6080
## 2023-02-01_rain.sum.lag 2023-03-01_rain.sum.lag 2023-04-01_rain.sum.lag
## 1 215.1493 279.3411 509.3222
## 2 470.6298 548.0837 839.2660
## 3 459.6726 564.8242 612.5806
## 4 488.0359 587.2738 698.1311
## 5 667.8848 832.3617 977.3868
## 6 281.6344 387.7473 521.9884
## 2023-05-01_rain.sum.lag 2023-06-01_rain.sum.lag 2023-07-01_rain.sum.lag
## 1 506.2685 448.0443 434.9563
## 2 867.6821 794.6204 712.8424
## 3 603.2209 416.3525 262.7130
## 4 703.3618 531.7645 383.2904
## 5 939.2760 855.6117 744.2889
## 6 509.0339 383.3928 332.5788
## 2023-08-01_rain.sum.lag 2023-09-01_rain.sum.lag 2023-10-01_rain.sum.lag
## 1 430.3774 366.50878 151.77184
## 2 671.7647 605.81817 375.59469
## 3 167.1400 61.98845 14.85932
## 4 281.9979 182.81237 74.83833
## 5 701.5641 576.55398 478.95586
## 6 293.7072 187.82502 55.57329
## 2023-11-01_rain.sum.lag 2023-12-01_rain.sum.lag 2024-01-01_rain.sum.lag
## 1 265.66469 299.4620 435.0957
## 2 742.63087 865.3863 1088.0394
## 3 61.64418 263.3634 560.4381
## 4 140.44970 310.9307 640.9717
## 5 677.93283 840.0104 941.7438
## 6 137.00931 245.0625 448.6259
## 2024-02-01_rain.sum.lag 2024-03-01_rain.sum.lag 2024-04-01_rain.sum.lag
## 1 531.3442 668.3936 896.7951
## 2 1108.6897 1277.2697 1582.9947
## 3 754.5153 856.2409 923.1968
## 4 804.6320 935.2507 1026.7195
## 5 1023.2543 1002.7834 1183.9751
## 6 603.8745 714.4857 863.0247
## 2024-05-01_rain.sum.lag 2024-06-01_rain.sum.lag
## 1 821.3999 770.3854
## 2 1235.6993 1103.4509
## 3 875.1995 673.5048
## 4 951.3725 780.1154
## 5 1003.6453 792.8721
## 6 793.5998 682.0563
Here we extract sum of lag rainfall for each row under the column rain.sum.lag
mypts_df <- as.data.frame(mypts)
# Define the function to obtain sum of lag rainfall from corresponding rasters to mypts under rain.sum.lag column (for each row)
# Each extraction has to match the month and year
get_rain_sum_row <- function(current_date, mypts_row) {
# Extract the rainfall value for the current date
rain_sum <- mypts_row[[paste0(current_date, "_rain.sum.lag")]]
return(rain_sum)
}
# Loop through each row and obtain the rainfall sum for each month and year
for (i in 1:nrow(mypts_df)) {
# Extract relevant data for the current row
month <- mypts_df$Month[i]
year <- mypts_df$Year[i]
current_date <- paste0(year, "-", sprintf("%02d", month), "-01") # Format date correctly
# Pass necessary data to the function
rain_sum <- get_rain_sum_row(current_date, mypts_df[i, ])
# Update the rain.sum.lag column
mypts_df$rain.sum.lag[i] <- rain_sum
}
# Update the SpatVector with the new rain.avg column
mypts$rain.sum.lag <- mypts_df$rain.sum.lag
# I'll drop the dates with rain.sum.lag from mypts, seems redundant
column_indices <- grep("^202[0-4]-", names(mypts))
mypts <- mypts[, -column_indices]
names(mypts)
## [1] "Region" "Market" "Month" "Year" "Latitude"
## [6] "Longitude" "Crop" "pkg" "maize" "rice"
## [11] "sorghum" "bmillet" "fmillet" "wheat" "beans"
## [16] "potato" "jan" "feb" "mar" "apr"
## [21] "may" "jun" "jul" "aug" "sep"
## [26] "oct" "nov" "dec" "ttcity_u5" "ttport_1"
## [31] "bio_1" "bio_2" "bio_3" "bio_4" "bio_5"
## [36] "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [41] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15"
## [46] "bio_16" "bio_17" "bio_18" "bio_19" "MAI_ARE"
## [51] "MAI_YLD" "popdens" "latitude" "longitude" "rain.sum.lag"
#define Month as a factor
#mypts$Month <- as.factor(mypts$Month)
#levels(mypts$Month)
#We'll define month as an interger instead.
# drop levels that don't exist in Crop field
mypts$Crop <- mypts$Crop[,drop=TRUE]
levels(mypts$Crop)
## [1] "Maize" "Rice" "Sorghum" "B.Millet" "F.Millet" "Wheat" "Beans"
## [8] "Potato"
Coefficient estimates from the linear model provide a detailed insight into the relationship between each predictor and the response variable.
# Fit the linear model
lm_model <- lm(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = mypts)
summary(lm_model)
##
## Call:
## lm(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet +
## wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 +
## bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 +
## bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 +
## bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude +
## Latitude + rain.sum.lag, data = mypts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1491.81 -239.93 -10.63 235.20 1602.93
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.592e+05 1.221e+04 -37.593 < 2e-16 ***
## maize -5.132e+01 1.850e+01 -2.774 0.005549 **
## rice 1.397e+03 1.850e+01 75.482 < 2e-16 ***
## sorghum 3.847e+02 1.957e+01 19.660 < 2e-16 ***
## bmillet 4.745e+02 2.103e+01 22.556 < 2e-16 ***
## fmillet 7.905e+02 1.933e+01 40.885 < 2e-16 ***
## wheat 9.527e+02 2.126e+01 44.821 < 2e-16 ***
## beans 1.516e+03 1.852e+01 81.852 < 2e-16 ***
## potato NA NA NA NA
## Month 6.676e+00 1.886e+00 3.540 0.000403 ***
## Year 2.186e+02 5.854e+00 37.343 < 2e-16 ***
## ttcity_u5 1.881e+00 1.588e-01 11.850 < 2e-16 ***
## ttport_1 -1.918e+00 1.685e-01 -11.388 < 2e-16 ***
## bio_1 -2.372e+03 1.891e+02 -12.542 < 2e-16 ***
## bio_2 -2.675e+03 2.351e+02 -11.377 < 2e-16 ***
## bio_3 2.262e+02 1.925e+01 11.747 < 2e-16 ***
## bio_4 2.172e+01 4.038e+00 5.380 7.73e-08 ***
## bio_5 1.992e+03 1.793e+02 11.107 < 2e-16 ***
## bio_6 -1.950e+03 1.923e+02 -10.139 < 2e-16 ***
## bio_7 NA NA NA NA
## bio_8 9.181e+02 9.258e+01 9.917 < 2e-16 ***
## bio_9 -6.182e+01 7.954e+01 -0.777 0.437097
## bio_10 -1.244e+03 2.052e+02 -6.061 1.43e-09 ***
## bio_11 2.754e+03 3.005e+02 9.165 < 2e-16 ***
## bio_12 1.155e+00 2.783e-01 4.150 3.38e-05 ***
## bio_13 -7.293e-02 9.213e-01 -0.079 0.936911
## bio_14 5.699e+01 1.259e+01 4.526 6.14e-06 ***
## bio_15 2.089e+01 2.133e+00 9.796 < 2e-16 ***
## bio_16 -1.014e+00 7.661e-01 -1.323 0.185851
## bio_17 -1.791e+01 3.588e+00 -4.991 6.19e-07 ***
## bio_18 1.459e-01 2.512e-01 0.581 0.561329
## bio_19 4.667e+00 2.700e+00 1.729 0.083933 .
## MAI_ARE -2.628e-02 3.250e-02 -0.808 0.418874
## MAI_YLD 1.432e-01 1.704e-02 8.406 < 2e-16 ***
## Longitude 9.447e+01 3.530e+01 2.676 0.007466 **
## Latitude 1.300e+01 3.535e+01 0.368 0.713134
## rain.sum.lag -2.220e-01 1.938e-02 -11.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 374.6 on 5718 degrees of freedom
## Multiple R-squared: 0.7403, Adjusted R-squared: 0.7388
## F-statistic: 479.5 on 34 and 5718 DF, p-value: < 2.2e-16
First check if there are any predictors with NA values
for(column in seq_along(mypts)){
if(any(is.na(mypts[column]))){
print(paste0("Column: ", colnames(mypts)[column], " has at least one NA value"))
}
}
#There are no columns with missing values
Data from May 2021 - Dec 2023 will be used for model training while more recent data from Jan - June 2024 will be used for Validation.
# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
head(training_data)
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1 Arusha Arusha 5 2021 -3.36968 36.68808 Maize 469.00 1 0
## 2 Dar es Salaam Temeke 5 2021 -6.85097 39.25672 Maize 485.00 1 0
## 3 Dodoma Majengo 5 2021 -6.17971 35.74109 Maize 501.50 1 0
## 4 Dodoma Kibaigwa 5 2021 -6.08110 36.64645 Maize 375.00 1 0
## 5 Kagera Bukoba 5 2021 -1.33140 31.81293 Maize 426.25 1 0
## 6 Manyara Babati 5 2021 -4.21006 35.74915 Maize 375.00 1 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## sep oct nov dec ttcity_u5 ttport_1 bio_1 bio_2 bio_3 bio_4
## 1 0 0 0 0 6.5704844 1405.327 19.27741 10.934885 68.03365 169.49426
## 2 0 0 0 0 0.5137705 1692.300 25.93087 8.970697 67.33757 153.60644
## 3 0 0 0 0 11.0485593 1752.489 22.37704 12.001499 69.67643 153.46371
## 4 0 0 0 0 83.7810446 1788.311 20.99961 12.303768 70.91631 155.25267
## 5 0 0 0 0 17.1729083 2006.820 20.84789 8.869465 83.92802 38.70414
## 6 0 0 0 0 133.5790574 1526.875 19.98732 10.788815 71.03883 157.18046
## bio_5 bio_6 bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 1 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375 882.1826
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572 582.4022
## 4 29.32247 11.97287 17.34961 22.51508 19.21250 22.53832 18.93717 649.1973
## 5 25.68171 15.11708 10.56463 21.07716 20.26752 21.19770 20.26752 1905.0292
## 6 27.34785 12.16727 15.18059 21.20938 18.19802 21.26028 17.67068 768.0339
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18 bio_19
## 1 237.8160 7.36089814 92.10601 482.4729 24.95659498 276.6105 32.88688141
## 2 264.3072 28.32696315 76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 133.9309 0.01660535 115.12207 375.4677 0.08605806 284.0681 0.08605806
## 4 129.1152 0.21909479 100.22821 368.5410 2.45863517 320.1588 4.15077049
## 5 346.9308 43.30159629 58.29730 857.7995 171.49365306 631.5838 171.49365306
## 6 149.2006 0.91639794 89.44063 381.6907 6.74877687 325.6114 12.35086025
## MAI_ARE MAI_YLD popdens latitude longitude rain.sum.lag
## 1 3.107393e+02 1.528850e+02 994.5706 -3.36968 36.68808 541.4019
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097 39.25672 809.3149
## 3 1.283795e-04 9.139631e-03 352.5410 -6.17971 35.74109 637.1213
## 4 1.489781e+03 3.828877e+02 101.0565 -6.08110 36.64645 656.0751
## 5 6.446327e+02 1.834932e+03 280.9803 -1.33140 31.81293 1149.9932
## 6 5.496486e+02 1.917829e+03 219.1393 -4.21006 35.74915 576.1356
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
head(validation_data)
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1 Dar es Salaam Ilala 1 2024 -6.82941 39.26289 Maize 775.0000 1 0
## 2 Dar es Salaam Temeke 1 2024 -6.85097 39.25672 Maize 850.0000 1 0
## 3 Kilimanjaro Moshi 1 2024 -3.34865 37.34352 Maize 850.0000 1 0
## 4 Singida Singida 1 2024 -4.81261 34.74280 Maize 700.0000 1 0
## 5 Arusha Arusha 1 2024 -3.36968 36.68808 Maize 709.0909 1 0
## 6 Dodoma Majengo 1 2024 -6.17971 35.74109 Maize 690.4545 1 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## sep oct nov dec ttcity_u5 ttport_1 bio_1 bio_2 bio_3 bio_4
## 1 0 0 0 0 0.4051143 1689.367 25.92678 8.978264 67.42460 152.9108
## 2 0 0 0 0 0.5137705 1692.300 25.93087 8.970697 67.33757 153.6064
## 3 0 0 0 0 11.3442758 1450.131 22.29721 11.005590 67.75448 180.7284
## 4 0 0 0 0 165.2321068 1606.655 20.38367 11.683626 71.57368 137.7943
## 5 0 0 0 0 6.5704844 1405.327 19.27741 10.934885 68.03365 169.4943
## 6 0 0 0 0 11.0485593 1752.489 22.37704 12.001499 69.67643 153.4637
## bio_5 bio_6 bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 1 32.19589 18.88015 13.31574 26.54073 24.05313 27.71579 24.02143 1122.8900
## 2 32.17974 18.85807 13.32167 26.54696 24.04985 27.72815 24.01825 1141.3144
## 3 31.30237 15.06427 16.23809 22.88764 20.26380 24.25183 19.85315 938.0520
## 4 28.10475 11.78088 16.32386 21.29394 18.36286 21.69755 18.36286 662.1342
## 5 27.99563 11.93202 16.06362 20.02968 17.25298 20.97571 16.90375 882.1826
## 6 30.61132 13.38613 17.22519 23.67508 20.17572 23.87677 20.17572 582.4022
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18 bio_19
## 1 258.0938 28.00268809 75.59542 582.2198 89.31864258 266.4441 103.38645312
## 2 264.3072 28.32696315 76.22544 594.2531 90.46537640 268.0798 105.37909361
## 3 277.1083 14.38640622 97.50828 541.7736 53.36366989 211.4703 72.58868003
## 4 140.9090 0.00000000 104.78967 387.3265 0.31403068 192.0734 0.31403068
## 5 237.8160 7.36089814 92.10601 482.4729 24.95659498 276.6105 32.88688141
## 6 133.9309 0.01660535 115.12207 375.4677 0.08605806 284.0681 0.08605806
## MAI_ARE MAI_YLD popdens latitude longitude rain.sum.lag
## 1 3.490943e-03 1.426787e-02 8044.9216 -6.82941 39.26289 1085.9139
## 2 1.631942e-03 6.669931e-03 7719.2284 -6.85097 39.25672 1088.0394
## 3 1.654091e+03 3.030923e+03 522.6697 -3.34865 37.34352 508.5944
## 4 1.631720e+01 1.052680e+03 254.1591 -4.81261 34.74280 613.3819
## 5 3.107393e+02 1.528850e+02 994.5706 -3.36968 36.68808 435.0957
## 6 1.283795e-04 9.139631e-03 352.5410 -6.17971 35.74109 560.4381
The tuneRF function in the randomForest package is used to tune the mtry parameter, which is the number of variables randomly sampled as candidates at each split in the random forest. The function requires a data frame of predictor variables and a response variable.
# Convert training_data data to data frame
mypts_df <- as.data.frame(training_data)
trf <- tuneRF(x=mypts_df[,1:ncol(mypts_df)], # Prediction variables
y=mypts_df$pkg) # Response variable
## mtry = 18 OOB error = 1330.723
## Searching left ...
## mtry = 9 OOB error = 6914.108
## -4.195754 0.05
## Searching right ...
## mtry = 36 OOB error = 164.153
## 0.8766437 0.05
## mtry = 55 OOB error = 96.79918
## 0.4103114 0.05
Based on the output from tuneRF, you can observe that the mtry value that gives the lowest Out-of-Bag (OOB) error. To build the first random forest model, we will use this mtry value.
(mintree <- trf[which.min(trf[,2]),1])
## [1] 55
Here, the model is fitted using the randomForest function to build a predictive model for food commodity prices. The model takes the response variable, the prediction variables and the optimal number of variables to consider at each split. The goal is to generate Variable importance scores which will help us understand which variables have the most significant impact on the response variable, enabling us to interpret the model and possibly simplify it by focusing on the most important predictors.
# Create the random forest model
rf1 <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = training_data,mtry=mintree,importance=TRUE,na.rm=TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf1
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 + bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude + Latitude + rain.sum.lag, data = training_data, mtry = mintree, importance = TRUE, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 36
##
## Mean of squared residuals: 24367.66
## % Var explained: 95.33
varImpPlot(rf1)
## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 155.9889
This calculates the RMSE of the tree in the Random Forest model that has the lowest OOB error.
importance_metrics <- importance(rf1, type=1) # %IncMSE
impvar <- rownames(importance_metrics)[order(importance_metrics[, 1], decreasing=TRUE)]
impvar
## [1] "Year" "Month" "beans" "rice" "Longitude"
## [6] "sorghum" "rain.sum.lag" "wheat" "fmillet" "bmillet"
## [11] "bio_12" "bio_18" "MAI_YLD" "Latitude" "ttcity_u5"
## [16] "bio_15" "MAI_ARE" "ttport_1" "bio_3" "bio_2"
## [21] "bio_8" "bio_1" "bio_5" "bio_4" "bio_13"
## [26] "bio_7" "bio_16" "bio_10" "bio_11" "bio_19"
## [31] "bio_14" "maize" "bio_6" "potato" "bio_9"
## [36] "bio_17"
# Get the top 20 variables
top_20_vars <- impvar[1:20]
top_20_vars
## [1] "Year" "Month" "beans" "rice" "Longitude"
## [6] "sorghum" "rain.sum.lag" "wheat" "fmillet" "bmillet"
## [11] "bio_12" "bio_18" "MAI_YLD" "Latitude" "ttcity_u5"
## [16] "bio_15" "MAI_ARE" "ttport_1" "bio_3" "bio_2"
node_purity <- importance(rf1, type=2) # IncNodePurity
# Sort variables by importance (IncNodePurity)
node_purity_sorted <- sort(node_purity[,1], decreasing = TRUE)
node_purity_sorted
## rice beans Year wheat fmillet Month
## 545262150 472837663 380711166 130780343 130524754 118049778
## potato bio_12 maize rain.sum.lag Longitude sorghum
## 48764484 46876442 43269385 41339371 40493227 38007061
## bmillet bio_3 bio_7 MAI_YLD ttcity_u5 bio_18
## 37037032 26611939 25621683 23047045 20340459 17334860
## Latitude bio_15 MAI_ARE ttport_1 bio_6 bio_2
## 13870649 13354723 12562599 11705550 9671336 8905350
## bio_5 bio_4 bio_9 bio_10 bio_19 bio_13
## 7887209 7186178 6312415 5692600 5546136 5381328
## bio_8 bio_1 bio_11 bio_16 bio_14 bio_17
## 5190873 5156708 4920759 4912567 4732035 3084859
# Select the top 20 important variables
top_vars <- names(node_purity_sorted)[1:20]
print(top_vars)
## [1] "rice" "beans" "Year" "wheat" "fmillet"
## [6] "Month" "potato" "bio_12" "maize" "rain.sum.lag"
## [11] "Longitude" "sorghum" "bmillet" "bio_3" "bio_7"
## [16] "MAI_YLD" "ttcity_u5" "bio_18" "Latitude" "bio_15"
rf1$importanceSD
## maize rice sorghum bmillet fmillet wheat
## 1882.1577 2770.9228 457.2999 630.7592 1639.2469 1556.6641
## beans potato Month Year ttcity_u5 ttport_1
## 2955.5197 1928.6726 273.3616 488.2066 277.1534 218.3697
## bio_1 bio_2 bio_3 bio_4 bio_5 bio_6
## 120.2380 214.5229 584.6667 181.9721 168.0418 316.5164
## bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 722.7443 115.8098 279.8230 155.6259 137.4150 828.6967
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 134.2437 129.5546 289.8310 147.2297 137.7304 274.4271
## bio_19 MAI_ARE MAI_YLD Longitude Latitude rain.sum.lag
## 195.6188 231.7859 433.6816 322.8656 276.0473 168.4705
In this section, we aim to refine our model by selecting the most important variables. We will review the importance metrics (%IncMSE and IncNodePurity) to identify the variables that contribute the most to the model’s predictive power. Our focus will be on variables with higher importance values to ensure a more efficient and interpretable model.
# Estimate more parsimonious specification
rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttport_1 +
bio_3 + bio_6 + bio_9 + bio_12 +
rain.sum.lag,
data=training_data, na.rm=TRUE)
rf
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttport_1 + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 29815.16
## % Var explained: 94.29
# evaluate
varImpPlot(rf)
(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 172.539
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")
These are prediction plots for each food commodities (maize, beans, rice, sorghum, bmillet, fmillet, wheat, potato) with their respective month of the year 2023.
year <- 2023
Note: we must set the rain.sum.lag variables for each month we are predicting
#Maize
# Create an empty list to store predictions for maize
predict_for_month <- function(month){
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")] # Remember to change depending on year
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_maize <- data.frame(
maize = 1, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_maize, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_maize <- lapply(1:12, predict_for_month)
# Extract pixel values from predictions_maize
maize_values <- unlist(lapply(predictions_maize, values))
# Get min and max values
min_maize <- min(maize_values, na.rm = TRUE)
max_maize <- max(maize_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Create a 3x4 matrix of plots
par(mar = c(0, 0, 0, 0)) # Set margins to 0 for inner plots
for (i in 1:12) {
plot(predictions_maize[[i]], main = paste("Maize prices", toupper(i), year),
zlim = c(min_maize, max_maize), col = color_palette, breaks = seq(min_maize, max_maize, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_maize, max_maize), n = 4)
# Reset plot layout for the legend
layout(matrix(1))
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_maize, max_maize), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Maize Price (Tsh)", side = 1, line = 2, cex = 0.9))
# Beans
# Function to predict beans for a given month
predict_for_beans <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_beans <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 1, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_beans, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_beans <- lapply(1:12, predict_for_beans)
# Extract pixel values from predictions_beans
bean_values <- unlist(lapply(predictions_beans, values))
min_bean <- min(bean_values, na.rm = TRUE)
max_bean <- max(bean_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot beans prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_beans[[i]], main = paste("Beans prices", toupper(i), year),
zlim = c(min_bean, max_bean), col = color_palette, breaks = seq(min_bean, max_bean, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bean, max_bean), n = 4)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bean, max_bean), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Beans Price (Tsh)", side = 1, line = 2, cex = 0.9))
# Rice
# Function to predict rice prices for a given month
predict_for_rice <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_rice <- data.frame(
maize = 0, rice = 1, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_rice, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_rice <- lapply(1:12, predict_for_rice)
# Extract pixel values from predictions_rice
rice_values <- unlist(lapply(predictions_rice, values))
min_rice <- min(rice_values, na.rm = TRUE)
max_rice <- max(rice_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot rice prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_rice[[i]], main = paste("Rice prices", toupper(i), year),
zlim = c(min_rice, max_rice), col = color_palette, breaks = seq(min_rice, max_rice, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_rice, max_rice), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_rice, max_rice), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Rice Price (Tsh)", side = 1, line = 2, cex = 0.9))
# Function to predict sorghum prices for a given month
predict_for_sorghum <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_sorghum <- data.frame(
maize = 0, rice = 0, sorghum = 1, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = 2023
)
pred <- predict(newstack, rf, const = const_sorghum, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_sorghum <- lapply(1:12, predict_for_sorghum)
# Extract pixel values from predictions_sorghum
sorghum_values <- unlist(lapply(predictions_sorghum, values))
min_sorghum <- min(sorghum_values, na.rm = TRUE)
max_sorghum <- max(sorghum_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 300
# Loop through each month to plot sorghum prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_sorghum[[i]], main = paste("Sorghum prices", toupper(i), year),
zlim = c(min_sorghum, max_sorghum), col = color_palette, breaks = seq(min_sorghum, max_sorghum, by = break_interval), legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_sorghum, max_sorghum), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_sorghum, max_sorghum), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Sorghum Price (Tsh)", side = 1, line = 2, cex = 0.9))
# bmillet
# Function to predict bmillet prices for a given month
predict_for_bmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_bmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 1, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_bmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_bmillet <- lapply(1:12, predict_for_bmillet)
# Extract pixel values from predictions_bmillet
bmillet_values <- unlist(lapply(predictions_bmillet, values))
min_bmillet <- min(bmillet_values, na.rm = TRUE)
max_bmillet <- max(bmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot bmillet prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_bmillet[[i]], main = paste("Bmillet prices", toupper(i), year),
zlim = c(min_bmillet, max_bmillet), col = color_palette,
breaks = seq(min_bmillet, max_bmillet, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bmillet, max_bmillet), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bmillet, max_bmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Bulrush Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))
#fmillet
# Function to predict fmillet prices for a given month
predict_for_fmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_fmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 1, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_fmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_fmillet <- lapply(1:12, predict_for_fmillet)
# Extract pixel values from predictions_fmillet
fmillet_values <- unlist(lapply(predictions_fmillet, values))
min_fmillet <- min(fmillet_values, na.rm = TRUE)
max_fmillet <- max(fmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot fmillet prices
break_interval <- 300
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_fmillet[[i]], main = paste("Fmillet prices", toupper(i), year),
zlim = c(min_fmillet, max_fmillet), col = color_palette,
breaks = seq(min_fmillet, max_fmillet, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_fmillet, max_fmillet), n = 3)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_fmillet, max_fmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Finger Millet Price (Tsh)", side = 1, line = 2, cex = 0.9))
#Wheat
# Function to predict wheat prices for a given month
predict_for_wheat <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_wheat <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 1, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_wheat, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_wheat <- lapply(1:12, predict_for_wheat)
# Extract pixel values from predictions_wheat
wheat_values <- unlist(lapply(predictions_wheat, values))
min_wheat <- min(wheat_values, na.rm = TRUE)
max_wheat <- max(wheat_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Define the break interval for both plot and legend
break_interval <- 100
# Loop through each month to plot wheat prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_wheat[[i]], main = paste("Wheat prices", toupper(i), year),
zlim = c(min_wheat, max_wheat), col = color_palette,
breaks = seq(min_wheat, max_wheat, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_wheat, max_wheat), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_wheat, max_wheat), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Wheat Price (Tsh)", side = 1, line = 2, cex = 0.9))
#potatoes
# Function to predict potato prices for a given month
predict_for_potato <- function(month) {
rain_sum_lag <- rstack3[paste0("2023-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_potato <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 1,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_potato, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_potato <- lapply(1:12, predict_for_potato)
# Extract pixel values from predictions_potato
potato_values <- unlist(lapply(predictions_potato, values))
min_potato <- min(potato_values, na.rm = TRUE)
max_potato <- max(potato_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot potato prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_potato[[i]], main = paste("Potato prices", toupper(i), year),
zlim = c(min_potato, max_potato), col = color_palette,
breaks = seq(min_potato, max_potato, by = break_interval),
legend = FALSE, axes = FALSE)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_potato, max_potato), n = 5) # Adjust n as needed
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_potato, max_potato), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Potato Price (Tsh)", side = 1, line = 2, cex = 0.9))
pred<-predict(object=rf, newdata=validation_data)
actual<-validation_data$pkg
result<-data.frame(actual=actual, predicted=pred)
mse <- mean((actual - pred)^2, na.rm=TRUE)
paste('Mean Squared Error:', mse)
## [1] "Mean Squared Error: 136117.087211266"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error: 176.24848662951"
#Save predicted & observed yield
write.csv(result, "result.csv")
#reading result.csv file (predicted vs observed)
rslt <- read.csv("result.csv", header=T)
print(names(rslt))
## [1] "X" "actual" "predicted"
#RMSE predicting from rf - predicited vs observed
rf.rmse<-round(sqrt(mean( (rslt$actual-rslt$predicted)^2 , na.rm = TRUE )),2)
print(rf.rmse)
## [1] 368.94
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,3)
print(rf.r2)
## [1] 0.8
range(actual)
## [1] 350 4050
range(pred)
## [1] 631.9554 3268.8138
#plotting predicted Vs observed
ggplot(result, aes(x=actual, y=predicted), alpha=0.6) +
geom_point(colour = "blue", size = 1.4, alpha=0.6) +
ggtitle('Random Forest "Wholesale Grain Prices in Tanzania"') +
scale_x_continuous("Observed Price (Tsh)",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
scale_y_continuous("Predicted Price (Tsh)",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
axis.text.x = element_text(size = 8)) +
geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
annotate("text", x = 300, y = 4500, label = paste("RMSE:", rf.rmse)) +
annotate("text", x = 300, y = 4200, label = paste("R^2: ", rf.r2), parse = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(stats)
mypts_df$pred <- stats::predict(rf)
rsq <- function (obs, pred) cor(obs, pred, use = 'complete.obs') ^ 2
RMSE <- function(obs, pred){sqrt(mean((pred - obs)^2, na.rm = TRUE))}
fr2_rsq <- rsq(mypts_df$pkg, mypts_df$pred) %>% round(digits = 2)
fr2_rmse <- RMSE(mypts_df$pkg, mypts_df$pred) %>% round(digits = 0)
Price_fit_plot <- ggplot(data = mypts_df, aes(x = pkg, y = pred)) +
geom_point(colour = "blue", size = 1.4 ,alpha=0.6) +
ggtitle('Observed vs Predicted "Wholesale Grain Prices in Tanzania"') +
geom_abline(slope = 1, alpha=0.3) +
annotate('text', x = 150, y = 4500, label = paste0("R^{2}==", fr2_rsq), parse = TRUE, size=3) +
annotate('text', x = 150, y = 4200, label = paste0("RMSE==", fr2_rmse), parse = TRUE, size=3) +
labs(x = "Observed Price (Tsh)", y = "Predicted Price (Tsh)") +
xlim(0, 5000) + ylim(0, 5000)
Price_fit_plot
Plot Partial dependence of all the variables used except food commodities and months.
library(caret)
var_importance <- varImp(rf)
impvar <- rownames(var_importance)[order(var_importance[, 1], decreasing=TRUE)]
op <- par(mfrow=c(2, 4))
# exclude food commodities and months
predictors_to_plot <- setdiff(impvar, c("maize", "rice", "sorghum", "bmillet", "fmillet", "wheat", "beans", "potato", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))
for (i in seq_along(predictors_to_plot)) {
partialPlot(rf, as.data.frame(training_data), predictors_to_plot[i], xlab=predictors_to_plot[i],
main="Partial Dependence")
}